[Devel] [PATCH RH7 v2] mm: high order allocation detector

Nikita Yushchenko nikita.yushchenko at virtuozzo.com
Wed Sep 1 14:36:27 MSK 2021


High order allocation detector monitors allocations of order greater
than zero, and generates an uevent if a configured number of allocations
happen within configured time.

In response to this uevent, userspace can enable event tracing. If a
stream of high-order allocations continues, the trace could help to
detect the code path causing them.

HOAD has a sysfs control interface, at /sys/kernel/mm/hoad/control:
- "enable ORDER COUNT MSECS"
  Sets up monitoring allocations of order ORDER: if COUNT such
  allocations are detected within MSECS, a uevent is sent. Further
  uevent generation is then suspended, to avoid userspace races.
- "disable ORDER"
  Stops monitoring allocations of order ORDER.
- "resume [delay-msecs]"
  Allow sending a new uevent, either immediately or after the given
  delay.

The uevent is generated with ACTION="change", SUBSYSTEM="hoad", ORDER
set to the order of the allocation that has caused the uevent.

Also HOAD provides a tracepoint named "hoad", under kmem/ group, that
can be used for tracing. This tracepoint fires on every allocation of
order greater than or equal to the minimal order for which monitoring
is enabled.

https://jira.sw.ru/browse/PSBM-92088
Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 include/trace/events/kmem.h |  12 ++
 mm/page_alloc.c             | 266 ++++++++++++++++++++++++++++++++++++
 2 files changed, 278 insertions(+)

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 9cb647609df3..b425c6856bfd 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -305,6 +305,18 @@ TRACE_EVENT(mm_page_alloc_extfrag,
 		__entry->alloc_migratetype == __entry->fallback_migratetype)
 );
 
+/*
+ * Fired from the page allocator for every allocation whose order is
+ * greater than or equal to the smallest order currently monitored by
+ * the high-order allocation detector; records only the order.
+ */
+TRACE_EVENT(hoad,
+	TP_PROTO(int order),
+	TP_ARGS(order),
+	TP_STRUCT__entry(
+		__field(int, order)
+	),
+	TP_fast_assign(
+		__entry->order = order;
+	),
+	TP_printk("order=%d", __entry->order)
+);
+
 #endif /* _TRACE_KMEM_H */
 
 /* This part must be outside protection */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1ae193b26a1d..959b1bfbafef 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3533,6 +3533,270 @@ static __always_inline void warn_high_order(int order, gfp_t gfp_mask)
 	}
 }
 
+/* Per-order monitoring state; published in hoad_table[] under RCU. */
+struct hoad_order_info {
+	unsigned long interval;			/* window length, jiffies */
+	int max_allocs;				/* hits per window -> uevent */
+	atomic_t counter;			/* hits in current window */
+	unsigned long since_jiffies;		/* window start time */
+	struct timer_list reset_counter_timer;	/* zeroes counter at window end */
+};
+
+/* Indexed by allocation order; NULL means the order is not monitored. */
+static struct hoad_order_info *hoad_table[MAX_ORDER];
+/* Serializes updates of hoad_table[] and hoad_trace_min_order. */
+static DEFINE_MUTEX(hoad_mutex);
+static struct kobject *hoad_kobj;
+/* Non-zero while a uevent is pending/suspended; holds the hit order. */
+static int hoad_uevent_order;
+static unsigned long hoad_resume_jiffies;
+/* Smallest monitored order; MAX_ORDER disables the tracepoint. */
+static int hoad_trace_min_order;
+
+#define MSEC_PER_MINUTE		(60 * MSEC_PER_SEC)
+#define MSEC_PER_HOUR		(60 * MSEC_PER_MINUTE)
+/* Fix: a day is 24 hours, not 60 (was 60 * MSEC_PER_HOUR). */
+#define MSEC_PER_DAY		(24 * MSEC_PER_HOUR)
+
+/* Timer callback: the monitoring window elapsed, restart the count. */
+static void hoad_reset_counter(struct timer_list *timer)
+{
+	struct hoad_order_info *info;
+
+	info = container_of(timer, struct hoad_order_info,
+			reset_counter_timer);
+	atomic_set(&info->counter, 0);
+}
+
+/*
+ * Deferred work that emits a "change" uevent on the hoad kobject with
+ * ORDER=<n> in the environment; <n> is the order whose threshold was
+ * hit (stashed in hoad_uevent_order by hoad_notice_alloc()).
+ */
+static void hoad_send_uevent(struct work_struct *work)
+{
+	char order_string[16];
+	char *envp[] = { order_string, NULL };
+
+	/*
+	 * snprintf rather than sprintf: "ORDER=" plus a full-range int
+	 * would not fit in 16 bytes; orders are small, but stay safe.
+	 */
+	snprintf(order_string, sizeof(order_string), "ORDER=%d",
+			hoad_uevent_order);
+	kobject_uevent_env(hoad_kobj, KOBJ_CHANGE, envp);
+}
+static DECLARE_WORK(hoad_send_uevent_work, hoad_send_uevent);
+
+/*
+ * Resume-timer callback: clearing hoad_uevent_order re-arms uevent
+ * generation (see the cmpxchg() in hoad_notice_alloc()).
+ *
+ * NOTE(review): this callback uses the pre-4.15 'unsigned long' timer
+ * prototype with 4-arg DEFINE_TIMER, while hoad_reset_counter() uses
+ * the timer_setup() API — confirm which timer API this kernel tree
+ * actually provides.
+ */
+static void hoad_resume(unsigned long unused)
+{
+	hoad_uevent_order = 0;
+}
+static DEFINE_TIMER(hoad_resume_timer, hoad_resume, 0, 0);
+
+/*
+ * Called from the allocation path for every allocation of order > 0.
+ * Hits the "hoad" tracepoint for orders at/above the monitored minimum
+ * and, when a monitored order reaches its configured count within its
+ * window, schedules a single uevent.
+ */
+static void hoad_notice_alloc(int order, gfp_t gfp)
+{
+	struct hoad_order_info *hoi;
+	int count;
+	bool hit = false;
+
+	/* Skip allocations explicitly marked as best-effort / no-warn */
+	if (gfp & (__GFP_NORETRY | __GFP_ORDER_NOWARN))
+		return;
+
+	if (order >= hoad_trace_min_order)
+		trace_hoad(order);
+
+	/* RCU protects hoi against concurrent disable/re-enable */
+	rcu_read_lock();
+	hoi = rcu_dereference(hoad_table[order]);
+	if (hoi) {
+		count = atomic_inc_return(&hoi->counter);
+		/* First hit of a window: record start, arm the reset timer */
+		if (count == 1) {
+			hoi->since_jiffies = jiffies;
+			mod_timer(&hoi->reset_counter_timer,
+					hoi->since_jiffies + hoi->interval);
+		}
+		/* '==' (not '>='): at most one hit per window */
+		hit = (count == hoi->max_allocs);
+	}
+	rcu_read_unlock();
+
+	if (hit) {
+		/* Only the first hit sends a uevent, until userspace resumes */
+		if (cmpxchg(&hoad_uevent_order, 0, order) == 0)
+			schedule_work(&hoad_send_uevent_work);
+	}
+}
+
+/*
+ * Publish a new (possibly NULL) hoad_order_info for @order and retire
+ * the previous one.  Also recomputes the smallest monitored order used
+ * to gate the tracepoint.  Takes ownership of @hoi; the old entry is
+ * freed after all RCU readers are done and its timer is stopped.
+ */
+static void hoad_install_order_info(int order, struct hoad_order_info *hoi)
+{
+	struct hoad_order_info *oldhoi;
+	int i;
+
+	mutex_lock(&hoad_mutex);
+	oldhoi = hoad_table[order];
+	rcu_assign_pointer(hoad_table[order], hoi);
+	/* Recompute min monitored order; MAX_ORDER if none are set */
+	for (i = 1; i < MAX_ORDER; i++) {
+		if (hoad_table[i])
+			break;
+	}
+	hoad_trace_min_order = i;
+	mutex_unlock(&hoad_mutex);
+
+	if (oldhoi) {
+		/* Wait out readers before the timer is killed and freed */
+		synchronize_rcu();
+		del_timer_sync(&oldhoi->reset_counter_timer);
+		kfree(oldhoi);
+	}
+}
+
+/*
+ * Start monitoring @order: once @max_allocs allocations are seen within
+ * @interval_msecs, a uevent is generated.  Returns 0 or a -errno.
+ */
+static int hoad_enable_for_order(int order, int max_allocs,
+		unsigned int interval_msecs)
+{
+	unsigned long interval_jiffies = msecs_to_jiffies(interval_msecs);
+	struct hoad_order_info *info;
+
+	if (order < 1 || order >= MAX_ORDER || max_allocs < 1 ||
+			interval_jiffies < 1)
+		return -EINVAL;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->interval = interval_jiffies;
+	info->max_allocs = max_allocs;
+	timer_setup(&info->reset_counter_timer, hoad_reset_counter, 0);
+
+	hoad_install_order_info(order, info);
+	return 0;
+}
+
+/* Stop monitoring @order by unpublishing its state, if any. */
+static int hoad_disable_for_order(int order)
+{
+	if (order >= 1 && order < MAX_ORDER) {
+		hoad_install_order_info(order, NULL);
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/*
+ * "control" read handler: one line per monitored order showing the
+ * current count/limit and elapsed/window times, plus a trailing note
+ * while uevent generation is suspended (with time until auto-resume).
+ */
+static ssize_t hoad_control_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	char *p = buf, *endp = &p[PAGE_SIZE - 1];
+	int order;
+	struct hoad_order_info *hoi;
+	int counter;
+	long d;
+	unsigned int msecs;
+
+	rcu_read_lock();
+	for (order = 1; order < MAX_ORDER; order++) {
+		hoi = rcu_dereference(hoad_table[order]);
+		if (!hoi)
+			continue;
+		counter = atomic_read(&hoi->counter);
+		msecs = counter ?
+			jiffies_to_msecs(jiffies - hoi->since_jiffies) : 0;
+		/*
+		 * scnprintf() returns the number of bytes actually
+		 * stored, so p can never run past endp.  snprintf()
+		 * returns the would-be length and on truncation would
+		 * push p beyond endp, turning the next size argument
+		 * into a huge value (buffer overrun).
+		 */
+		p += scnprintf(p, endp - p,
+				"order %d: %d/%d in %u/%u msecs\n",
+				order, counter, hoi->max_allocs,
+				msecs, jiffies_to_msecs(hoi->interval));
+	}
+	rcu_read_unlock();
+
+	if (hoad_uevent_order) {
+		p += scnprintf(p, endp - p, "event generation suspended");
+		d = (long)(hoad_resume_jiffies - jiffies);
+		if (d > 0) {
+			unsigned long val;
+
+			p += scnprintf(p, endp - p, ", resume in ");
+			msecs = jiffies_to_msecs(d);
+			if (msecs >= 2 * MSEC_PER_HOUR) {
+				/* round to nearest hour */
+				val = (msecs + MSEC_PER_HOUR / 2) /
+						MSEC_PER_HOUR;
+				p += scnprintf(p, endp - p, "%lu hours",
+						val);
+			} else if (msecs > 2 * MSEC_PER_MINUTE) {
+				/* round to nearest minute */
+				val = (msecs + MSEC_PER_MINUTE / 2) /
+						MSEC_PER_MINUTE;
+				p += scnprintf(p, endp - p, "%lu minutes",
+						val);
+			} else {
+				/* round up to whole seconds */
+				val = (msecs + MSEC_PER_SEC - 1) /
+						MSEC_PER_SEC;
+				p += scnprintf(p, endp - p, "%lu seconds",
+						val);
+			}
+		}
+		p += scnprintf(p, endp - p, "\n");
+	}
+
+	return p - buf;
+}
+
+/*
+ * "control" write handler.  Commands:
+ *   "enable ORDER COUNT MSECS" - monitor ORDER, uevent on COUNT/MSECS
+ *   "disable ORDER"            - stop monitoring ORDER
+ *   "resume [MSECS]"           - re-allow uevents, now or after MSECS
+ * Returns @len on success, -errno on failure.
+ */
+static ssize_t hoad_control_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t len)
+{
+	char *p, *q;
+	int order, max_allocs, ret;
+	unsigned int msecs;
+	bool do_resume = false;
+	char c;
+
+	if (len == 0)
+		return 0;
+	p = kstrdup(buf, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	q = strim(p);
+	if (*q == '\0') {
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * %d for int targets (%u into an int is a format mismatch); the
+	 * trailing %c rejects commands carrying extra arguments.
+	 */
+	if (sscanf(q, "enable %d %d %u%c",
+				&order, &max_allocs, &msecs, &c) == 3) {
+		ret = hoad_enable_for_order(order, max_allocs, msecs);
+	} else if (sscanf(q, "disable %d%c", &order, &c) == 1) {
+		ret = hoad_disable_for_order(order);
+	} else if (sscanf(q, "resume %u%c", &msecs, &c) == 1) {
+		if (msecs > 5 * MSEC_PER_DAY) {
+			ret = -EINVAL;
+		} else {
+			do_resume = true;
+			ret = 0;
+		}
+	} else if (!strcmp(q, "resume")) {
+		/* bare "resume": re-allow uevents immediately */
+		msecs = 0;
+		do_resume = true;
+		ret = 0;
+	} else {
+		ret = -EINVAL;
+	}
+
+	if (do_resume) {
+		hoad_resume_jiffies = jiffies + msecs_to_jiffies(msecs);
+		mod_timer(&hoad_resume_timer, hoad_resume_jiffies);
+	}
+
+out:
+	kfree(p);
+	return ret ? ret : len;
+}
+
+/* /sys/kernel/mm/hoad/control — readable and writable by root only. */
+static struct kobj_attribute hoad_control_attr =
+	__ATTR(control, S_IRUSR | S_IWUSR,
+			hoad_control_show, hoad_control_store);
+
+/*
+ * Create /sys/kernel/mm/hoad/ with its "control" attribute and
+ * initialize state: no order monitored, uevent generation allowed.
+ */
+static int hoad_init(void)
+{
+	struct kset *kset;
+	int ret;
+
+	/* To be able to generate uevents, need a kobject with kset defined.
+	 *
+	 * To avoid extra depth inside sysfs, create a kset and use its
+	 * internal kobject, by setting its 'kset' field to itself.
+	 */
+	kset = kset_create_and_add("hoad", NULL, mm_kobj);
+	if (!kset)
+		return -ENOMEM;
+	hoad_kobj = &kset->kobj;
+	hoad_kobj->kset = kset;
+
+	ret = sysfs_create_file(hoad_kobj, &hoad_control_attr.attr);
+	if (ret) {
+		/* Undo the self-reference before dropping the kset */
+		hoad_kobj->kset = NULL;
+		hoad_kobj = NULL;
+		kset_put(kset);
+		return ret;
+	}
+
+	/* MAX_ORDER == "nothing monitored": keeps the tracepoint quiet */
+	hoad_trace_min_order = MAX_ORDER;
+	hoad_resume_jiffies = jiffies;
+	return 0;
+}
+late_initcall(hoad_init);
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -3557,6 +3821,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		!(current->flags & PF_MEMALLOC));
 
 	warn_high_order(order, gfp_mask);
+	if (order > 0)
+		hoad_notice_alloc(order, gfp_mask);
 
 	if (should_fail_alloc_page(gfp_mask, order))
 		return NULL;
-- 
2.20.1



More information about the Devel mailing list