[Devel] [PATCH] mm: high order allocation detector
Nikita Yushchenko
nikita.yushchenko at virtuozzo.com
Tue Aug 31 14:28:47 MSK 2021
High order allocation detector monitors allocations of order greater
than zero, and generates an uevent if a configured number of allocations
happen within configured time.
In return to this uevent, userspace can enable event tracing. If a
stream of high-order allocations continues, the trace could help to
detect the code path causing them.
HOAD has a sysfs control interface, at /sys/kernel/mm/hoad/control:
- "enable ORDER COUNT MSECS"
Sets up monitoring allocations of order ORDER: if COUNT such
allocations are detected within MSECS, uevent is sent. Then further
uevent generation is suspended, to avoid userspace races.
- "disable ORDER"
Stops monitoring allocations of order ORDER.
- "resume [delay-msecs]"
Allow sending a new uevent, either immediately or after the given
delay.
The uevent is generated with ACTION="change", SUBSYSTEM="hoad", ORDER
set to the order of the allocation that caused the uevent.
Also HOAD provides a tracepoint named "hoad", under kmem/ group, that
could be used for tracing. This tracepoint hits on every allocation of
order greater than or equal to the minimal order for which monitoring is enabled.
https://jira.sw.ru/browse/PSBM-92088
Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
include/trace/events/kmem.h | 12 ++
mm/page_alloc.c | 257 ++++++++++++++++++++++++++++++++++++
2 files changed, 269 insertions(+)
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 9cb647609df3..b425c6856bfd 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -305,6 +305,18 @@ TRACE_EVENT(mm_page_alloc_extfrag,
__entry->alloc_migratetype == __entry->fallback_migratetype)
);
+/* Hit for every allocation at or above the minimal HOAD-monitored order. */
+TRACE_EVENT(hoad,
+ TP_PROTO(int order),
+ TP_ARGS(order),
+ TP_STRUCT__entry(
+ __field(int, order)
+ ),
+ TP_fast_assign(
+ __entry->order = order;
+ ),
+ TP_printk("order=%d", __entry->order)
+);
+
#endif /* _TRACE_KMEM_H */
/* This part must be outside protection */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1ae193b26a1d..2a0d459bbaa3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3533,6 +3533,261 @@ static __always_inline void warn_high_order(int order, gfp_t gfp_mask)
}
}
+/* Per-order monitoring state; published in hoad_table[] under RCU. */
+struct hoad_order_info {
+ unsigned long interval; /* detection window length, in jiffies */
+ int max_allocs; /* allocations per window that trigger a uevent */
+ atomic_t counter; /* allocations seen in the current window */
+ unsigned long since_jiffies; /* start time of the current window */
+ struct delayed_work reset_counter_work; /* zeroes counter after interval */
+};
+
+/* Per-order monitoring state; a NULL slot means the order is not monitored. */
+static struct hoad_order_info *hoad_table[MAX_ORDER];
+/* Serializes writers of hoad_table[] and hoad_trace_min_order. */
+static DEFINE_MUTEX(hoad_mutex);
+/* Kobject used as the uevent source (the embedded kobject of the "hoad" kset). */
+static struct kobject *hoad_kobj;
+/* Nonzero while uevent generation is suspended; holds the order that fired. */
+static int hoad_uevent_order;
+/* Time at which a delayed "resume" request re-enables uevent generation. */
+static unsigned long hoad_resume_jiffies;
+/*
+ * Smallest monitored order; allocations at or above it hit the "hoad"
+ * tracepoint.  Must start at MAX_ORDER (nothing monitored): the implicit
+ * zero initialization would make every order > 0 allocation hit the
+ * tracepoint between boot and the late_initcall that runs hoad_init().
+ */
+static int hoad_trace_min_order = MAX_ORDER;
+
+/* Delayed work: close the current detection window for one order. */
+static void hoad_reset_counter(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct hoad_order_info *hoi;
+
+ hoi = container_of(dwork, struct hoad_order_info, reset_counter_work);
+ atomic_set(&hoi->counter, 0);
+}
+
+/* Work item: emit a KOBJ_CHANGE uevent carrying the order that fired. */
+static void hoad_send_uevent(struct work_struct *work)
+{
+ /* 24 bytes fits "ORDER=" plus any int, including a sign. */
+ char order_string[24];
+ char *envp[] = { order_string, NULL };
+
+ /* snprintf, not sprintf: never rely on the value fitting. */
+ snprintf(order_string, sizeof(order_string), "ORDER=%d",
+ hoad_uevent_order);
+ kobject_uevent_env(hoad_kobj, KOBJ_CHANGE, envp);
+}
+static DECLARE_WORK(hoad_send_uevent_work, hoad_send_uevent);
+
+/*
+ * Delayed work: re-enable uevent generation.  Clearing hoad_uevent_order
+ * lets the next threshold hit win the cmpxchg in hoad_notice_alloc().
+ */
+static void hoad_resume(struct work_struct *work)
+{
+ hoad_uevent_order = 0;
+}
+static DECLARE_DELAYED_WORK(hoad_resume_work, hoad_resume);
+
+/*
+ * Called from the page allocator for every order > 0 allocation.
+ * Hits the "hoad" tracepoint when the order is at or above the minimal
+ * monitored order, counts the allocation against its per-order window,
+ * and schedules a uevent when the configured threshold is reached.
+ */
+static void hoad_notice_alloc(int order, gfp_t gfp)
+{
+ struct hoad_order_info *hoi;
+ int count;
+ bool hit = false;
+
+ /* Callers that explicitly opted out of retries/warnings are skipped. */
+ if (gfp & (__GFP_NORETRY | __GFP_ORDER_NOWARN))
+ return;
+
+ if (order >= hoad_trace_min_order)
+ trace_hoad(order);
+
+ rcu_read_lock();
+ hoi = rcu_dereference(hoad_table[order]);
+ if (hoi) {
+ count = atomic_inc_return(&hoi->counter);
+ /* First allocation of a window starts the reset timer. */
+ if (count == 1) {
+ hoi->since_jiffies = jiffies;
+ schedule_delayed_work(&hoi->reset_counter_work,
+ hoi->interval);
+ }
+ /* "==" (not ">=") so the threshold fires once per window. */
+ hit = (count == hoi->max_allocs);
+ }
+ rcu_read_unlock();
+
+ if (hit) {
+ /* cmpxchg: only one order at a time may own the uevent. */
+ if (cmpxchg(&hoad_uevent_order, 0, order) == 0)
+ schedule_work(&hoad_send_uevent_work);
+ }
+}
+
+/*
+ * Publish new monitoring state for @order (or clear it when @hoi is NULL)
+ * and recompute the minimal monitored order.  Any previous state is freed
+ * only after an RCU grace period plus cancellation of its reset work, so
+ * concurrent hoad_notice_alloc() readers never see a stale pointer.
+ */
+static void hoad_install_order_info(int order, struct hoad_order_info *hoi)
+{
+ struct hoad_order_info *oldhoi;
+ int i;
+
+ mutex_lock(&hoad_mutex);
+ oldhoi = hoad_table[order];
+ rcu_assign_pointer(hoad_table[order], hoi);
+ /* Loop leaves i == MAX_ORDER when nothing is monitored at all. */
+ for (i = 1; i < MAX_ORDER; i++) {
+ if (hoad_table[i])
+ break;
+ }
+ hoad_trace_min_order = i;
+ mutex_unlock(&hoad_mutex);
+
+ if (oldhoi) {
+ /* Readers first, then the work that may still touch oldhoi. */
+ synchronize_rcu();
+ cancel_delayed_work_sync(&oldhoi->reset_counter_work);
+ kfree(oldhoi);
+ }
+}
+
+/*
+ * Start monitoring @order: send a uevent once @max_allocs allocations are
+ * seen within @interval_msecs.  Returns 0, -EINVAL on out-of-range input,
+ * or -ENOMEM.  Replaces any existing configuration for @order.
+ */
+static int hoad_enable_for_order(int order, int max_allocs,
+ unsigned int interval_msecs)
+{
+ struct hoad_order_info *hoi;
+ unsigned long interval;
+
+ if (order < 1 || order >= MAX_ORDER)
+ return -EINVAL;
+ if (max_allocs < 1)
+ return -EINVAL;
+ /* Reject intervals that round down to zero jiffies. */
+ interval = msecs_to_jiffies(interval_msecs);
+ if (interval < 1)
+ return -EINVAL;
+
+ hoi = kzalloc(sizeof(*hoi), GFP_KERNEL);
+ if (!hoi)
+ return -ENOMEM;
+ hoi->interval = interval;
+ hoi->max_allocs = max_allocs;
+ INIT_DELAYED_WORK(&hoi->reset_counter_work, hoad_reset_counter);
+
+ hoad_install_order_info(order, hoi);
+ return 0;
+}
+
+/* Stop monitoring @order; -EINVAL if the order is out of range. */
+static int hoad_disable_for_order(int order)
+{
+ if (order >= 1 && order < MAX_ORDER) {
+ hoad_install_order_info(order, NULL);
+ return 0;
+ }
+ return -EINVAL;
+}
+
+/*
+ * "control" read handler: dump per-order monitoring state and, when uevent
+ * generation is suspended, report when (if ever) it resumes.
+ */
+static ssize_t hoad_control_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ char *p = buf;
+ int order;
+ struct hoad_order_info *hoi;
+ int counter;
+ long d;
+ unsigned int msecs;
+
+ rcu_read_lock();
+ for (order = 1; order < MAX_ORDER; order++) {
+ hoi = rcu_dereference(hoad_table[order]);
+ if (hoi) {
+ counter = atomic_read(&hoi->counter);
+ msecs = counter ?
+ jiffies_to_msecs(jiffies - hoi->since_jiffies) :
+ 0;
+ /* %d, not %u: order/counter/max_allocs are signed. */
+ p += sprintf(p, "order %d: %d/%d in %u/%u msecs\n",
+ order, counter, hoi->max_allocs,
+ msecs, jiffies_to_msecs(hoi->interval));
+ }
+ }
+ rcu_read_unlock();
+
+ if (hoad_uevent_order) {
+ p += sprintf(p, "event generation suspended");
+ /* Signed difference: negative once the resume time passed. */
+ d = (long)(hoad_resume_jiffies - jiffies);
+ if (d > 0) {
+ p += sprintf(p, ", resume in ");
+ msecs = jiffies_to_msecs(d);
+ if (msecs >= 1000 * 60 * 60 * 2)
+ p += sprintf(p, "%u hours",
+ msecs / (1000 * 60 * 60));
+ else if (msecs > 1000 * 60 * 2)
+ p += sprintf(p, "%u minutes",
+ msecs / (1000 * 60));
+ else
+ p += sprintf(p, "%u seconds",
+ (msecs + 999) / 1000);
+ }
+ *p++ = '\n';
+ }
+
+ return p - buf;
+}
+
+/*
+ * "control" write handler.  Accepted commands:
+ *   "enable ORDER COUNT MSECS" - monitor allocations of order ORDER
+ *   "disable ORDER"            - stop monitoring ORDER
+ *   "resume [MSECS]"           - re-allow uevents, now or after MSECS
+ */
+static ssize_t hoad_control_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ char *p, *q;
+ int order, max_allocs, ret;
+ unsigned int msecs;
+ unsigned long d;
+ char c;
+
+ if (len == 0)
+ return 0;
+ p = kstrdup(buf, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ q = strim(p);
+ if (*q == '\0') {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * The trailing %c must NOT match: requiring a conversion count equal
+ * to the number of wanted fields rejects trailing garbage.
+ * %d (not %u) for the int arguments; range checking is done by the
+ * hoad_*_for_order() helpers.
+ */
+ if (sscanf(q, "enable %d %d %u%c",
+ &order, &max_allocs, &msecs, &c) == 3)
+ ret = hoad_enable_for_order(order, max_allocs, msecs);
+ else if (sscanf(q, "disable %d%c", &order, &c) == 1)
+ ret = hoad_disable_for_order(order);
+ else if (sscanf(q, "resume %u%c", &msecs, &c) == 1) {
+ /* Cap the delay at 5 days to catch obvious typos. */
+ if (msecs > 1000 * 60 * 60 * 24 * 5)
+ ret = -EINVAL;
+ else {
+do_resume:
+ d = msecs_to_jiffies(msecs);
+ hoad_resume_jiffies = jiffies + d;
+ cancel_delayed_work_sync(&hoad_resume_work);
+ schedule_delayed_work(&hoad_resume_work, d);
+ ret = 0;
+ }
+ }
+ else if (!strcmp(q, "resume")) {
+ /* Bare "resume": resume immediately. */
+ msecs = 0;
+ goto do_resume;
+ } else
+ ret = -EINVAL;
+
+out:
+ kfree(p);
+ return ret ? ret : len;
+}
+
+/* /sys/kernel/mm/hoad/control - root-only read/write. */
+static struct kobj_attribute hoad_control_attr = {
+ .attr.name = "control",
+ .attr.mode = S_IRUSR | S_IWUSR,
+ .show = hoad_control_show,
+ .store = hoad_control_store,
+};
+
+/* Create the "hoad" kset under /sys/kernel/mm and its control file. */
+static int hoad_init(void)
+{
+ struct kset *kset;
+ int ret;
+
+ /* To be able to generate uevents, need a kobject with kset defined.
+ *
+ * To avoid extra depth inside sysfs, create a kset and use its
+ * internal kobject, by setting its 'kset' field to itself.
+ */
+ kset = kset_create_and_add("hoad", NULL, mm_kobj);
+ if (!kset)
+ return -ENOMEM;
+ hoad_kobj = &kset->kobj;
+ hoad_kobj->kset = kset;
+
+ ret = sysfs_create_file(hoad_kobj, &hoad_control_attr.attr);
+ if (ret) {
+ hoad_kobj->kset = NULL;
+ hoad_kobj = NULL;
+ kset_put(kset);
+ return ret;
+ }
+
+ hoad_trace_min_order = MAX_ORDER;
+ hoad_resume_jiffies = jiffies;
+ return 0;
+}
+late_initcall(hoad_init);
+
/*
* This is the 'heart' of the zoned buddy allocator.
*/
@@ -3557,6 +3812,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
!(current->flags & PF_MEMALLOC));
warn_high_order(order, gfp_mask);
+ if (order > 0)
+ hoad_notice_alloc(order, gfp_mask);
if (should_fail_alloc_page(gfp_mask, order))
return NULL;
--
2.20.1
More information about the Devel
mailing list