[Devel] [PATCH] mm: high order allocation detector

Kirill Tkhai ktkhai at virtuozzo.com
Tue Aug 31 16:54:41 MSK 2021


On 31.08.2021 14:28, Nikita Yushchenko wrote:
> High order allocation detector monitors allocations of order greater
> than zero, and generates an uevent if a configured number of allocations
> happen within configured time.
> 
> In return to this uevent, userspace can enable event tracing. If a
> stream of high-order allocations continues, the trace could help to
> detect the code path causing them.
> 
> HOAD has a sysfs control interface, at /sys/kernel/mm/hoad/control:
> - "enable ORDER COUNT MSECS"
>   Sets up monitoring allocations of order ORDER: if COUNT such
>   allocations are detected within MSECS, a uevent is sent. Then further
>   uevents are suspended, to avoid userspace races.
> - "disable ORDER"
>   Stops monitoring allocations of order ORDER.
> - "resume [delay-msecs]"
>   Allow sending a new uevent, either immediately or after the given
>   delay.
> 
> The uevent is generated with ACTION="change", SUBSYSTEM="hoad", ORDER
> set to the order of the allocation that caused the uevent.
> 
> Also HOAD provides a tracepoint named "hoad", under kmem/ group, that
> could be used for tracing. This tracepoint hits on every allocation of
> order greater or equal to minimal order for which monitoring is enabled.
> 
> https://jira.sw.ru/browse/PSBM-92088
> Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
> ---
>  include/trace/events/kmem.h |  12 ++
>  mm/page_alloc.c             | 257 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 269 insertions(+)
> 
> diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
> index 9cb647609df3..b425c6856bfd 100644
> --- a/include/trace/events/kmem.h
> +++ b/include/trace/events/kmem.h
> @@ -305,6 +305,18 @@ TRACE_EVENT(mm_page_alloc_extfrag,
>  		__entry->alloc_migratetype == __entry->fallback_migratetype)
>  );
>  
> +TRACE_EVENT(hoad,
> +	TP_PROTO(int order),
> +	TP_ARGS(order),
> +	TP_STRUCT__entry(
> +		__field(int, order)
> +	),
> +	TP_fast_assign(
> +		__entry->order = order;
> +	),
> +	TP_printk("order=%d", __entry->order)
> +);
> +
>  #endif /* _TRACE_KMEM_H */
>  
>  /* This part must be outside protection */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 1ae193b26a1d..2a0d459bbaa3 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3533,6 +3533,261 @@ static __always_inline void warn_high_order(int order, gfp_t gfp_mask)
>  	}
>  }
>  
> +struct hoad_order_info {
> +	unsigned long interval;
> +	int max_allocs;
> +	atomic_t counter;
> +	unsigned long since_jiffies;
> +	struct delayed_work reset_counter_work;
> +};
> +
> +static struct hoad_order_info *hoad_table[MAX_ORDER];
> +static DEFINE_MUTEX(hoad_mutex);
> +static struct kobject *hoad_kobj;
> +static int hoad_uevent_order;
> +static unsigned long hoad_resume_jiffies;
> +static int hoad_trace_min_order;
> +
> +static void hoad_reset_counter(struct work_struct *work)
> +{
> +	struct hoad_order_info *hoi = container_of(work,
> +			struct hoad_order_info, reset_counter_work.work);
> +
> +	atomic_set(&hoi->counter, 0);

Do we really need a work item to perform such a simple action? Why is a plain timer not enough?

> +}
> +
> +static void hoad_send_uevent(struct work_struct *work)
> +{
> +	char order_string[16];
> +	char *envp[] = { order_string, NULL };

Please add a blank line after the declarations.

> +	sprintf(order_string, "ORDER=%d", hoad_uevent_order);
> +	kobject_uevent_env(hoad_kobj, KOBJ_CHANGE, envp);
> +}
> +static DECLARE_WORK(hoad_send_uevent_work, hoad_send_uevent);
> +
> +static void hoad_resume(struct work_struct *work)
> +{
> +	hoad_uevent_order = 0;

Same here: do we really need a work item for this?

> +}
> +static DECLARE_DELAYED_WORK(hoad_resume_work, hoad_resume);
> +
> +static void hoad_notice_alloc(int order, gfp_t gfp)
> +{
> +	struct hoad_order_info *hoi;
> +	int count;
> +	bool hit = false;
> +
> +	if (gfp & (__GFP_NORETRY | __GFP_ORDER_NOWARN))
> +		return;
> +
> +	if (order >= hoad_trace_min_order)
> +		trace_hoad(order);
> +
> +	rcu_read_lock();
> +	hoi = rcu_dereference(hoad_table[order]);
> +	if (hoi) {
> +		count = atomic_inc_return(&hoi->counter);
> +		if (count == 1) {
> +			hoi->since_jiffies = jiffies;
> +			schedule_delayed_work(&hoi->reset_counter_work,
> +					hoi->interval);
> +		}
> +		hit = (count == hoi->max_allocs);
> +	}
> +	rcu_read_unlock();
> +
> +	if (hit) {
> +		if (cmpxchg(&hoad_uevent_order, 0, order) == 0)
> +			schedule_work(&hoad_send_uevent_work);
> +	}
> +}
> +
> +static void hoad_install_order_info(int order, struct hoad_order_info *hoi)
> +{
> +	struct hoad_order_info *oldhoi;
> +	int i;
> +
> +	mutex_lock(&hoad_mutex);
> +	oldhoi = hoad_table[order];
> +	rcu_assign_pointer(hoad_table[order], hoi);
> +	for (i = 1; i < MAX_ORDER; i++) {
> +		if (hoad_table[i])
> +			break;
> +	}
> +	hoad_trace_min_order = i;
> +	mutex_unlock(&hoad_mutex);
> +
> +	if (oldhoi) {
> +		synchronize_rcu();
> +		cancel_delayed_work_sync(&oldhoi->reset_counter_work);

Why do we completely cancel this? We possibly updated a single order entry;
don't the other orders still need to be reset?

> +		kfree(oldhoi);
> +	}
> +}
> +
> +static int hoad_enable_for_order(int order, int max_allocs,
> +		unsigned int interval_msecs)
> +{
> +	struct hoad_order_info *hoi;
> +	unsigned long interval;
> +
> +	if (order < 1 || order >= MAX_ORDER)
> +		return -EINVAL;
> +	if (max_allocs < 1)
> +		return -EINVAL;
> +	interval = msecs_to_jiffies(interval_msecs);
> +	if (interval < 1)
> +		return -EINVAL;
> +
> +	hoi = kzalloc(sizeof(*hoi), GFP_KERNEL);
> +	if (!hoi)
> +		return -ENOMEM;
> +	hoi->interval = interval;
> +	hoi->max_allocs = max_allocs;
> +	INIT_DELAYED_WORK(&hoi->reset_counter_work, hoad_reset_counter);
> +
> +	hoad_install_order_info(order, hoi);
> +	return 0;
> +}
> +
> +static int hoad_disable_for_order(int order)
> +{
> +	if (order < 1 || order >= MAX_ORDER)
> +		return -EINVAL;
> +
> +	hoad_install_order_info(order, NULL);
> +	return 0;
> +}
> +
> +static ssize_t hoad_control_show(struct kobject *kobj,
> +		struct kobj_attribute *attr, char *buf)
> +{
> +	char *p = buf;
> +	int order;
> +	struct hoad_order_info *hoi;
> +	int counter;
> +	long d;
> +	unsigned int msecs;
> +
> +	rcu_read_lock();
> +	for (order = 1; order < MAX_ORDER; order++) {
> +		hoi = rcu_dereference(hoad_table[order]);
> +		if (hoi) {
> +			counter = atomic_read(&hoi->counter);
> +			msecs = counter ?
> +				jiffies_to_msecs(jiffies - hoi->since_jiffies) :
> +				0;
> +			p += sprintf(p, "order %u: %u/%u in %u/%u msecs\n",
> +					order, counter, hoi->max_allocs,
> +					msecs, jiffies_to_msecs(hoi->interval));

Are there any guarantees that all the data printed in this function fits into the provided @buf?

> +		}
> +	}
> +	rcu_read_unlock();
> +	if (hoad_uevent_order) {
> +		p += sprintf(p, "event generation suspended");
> +		d = (long)(hoad_resume_jiffies - jiffies);
> +		if (d > 0) {
> +			p += sprintf(p, ", resume in ");
> +			msecs = jiffies_to_msecs(d);
> +			if (msecs >= 1000 * 60 * 60 * 2)
> +				p += sprintf(p, "%u hours",
> +						msecs / (1000 * 60 * 60));
> +			else if (msecs > 1000 * 60 * 2)
> +				p += sprintf(p, "%u minutes",
> +						msecs / (1000 * 60));
> +			else
> +				p += sprintf(p, "%u seconds",
> +						(msecs + 999) / 1000);
> +		}
> +		*p++ = '\n';
> +	}
> +
> +	return p - buf;
> +}
> +
> +static ssize_t hoad_control_store(struct kobject *kobj,
> +		struct kobj_attribute *attr, const char *buf, size_t len)
> +{
> +	char *p, *q;
> +	int order, max_allocs, ret;
> +	unsigned int msecs;
> +	unsigned long d;
> +	char c;
> +
> +	if (len == 0)
> +		return 0;
> +	p = kstrdup(buf, GFP_KERNEL);
> +	if (!p)
> +		return -ENOMEM;
> +	q = strim(p);
> +	if (*q == '\0') {
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	if (sscanf(q, "enable %u %u %u%c",
> +				&order, &max_allocs, &msecs, &c) == 3)
> +		ret = hoad_enable_for_order(order, max_allocs, msecs);
> +	else if (sscanf(q, "disable %u%c", &order, &c) == 1)
> +		ret = hoad_disable_for_order(order);
> +	else if (sscanf(q, "resume %u%c", &msecs, &c) == 1) {
> +		if (msecs > 1000 * 60 * 60 * 24 * 5)

Every magic number should be described with #define.

> +			ret = -EINVAL;
> +		else {
> +do_resume:
> +			d = msecs_to_jiffies(msecs);
> +			hoad_resume_jiffies = jiffies + d;
> +			cancel_delayed_work_sync(&hoad_resume_work);
> +			schedule_delayed_work(&hoad_resume_work, d);

It looks like we could use mod_delayed_work(system_wq, ...) here instead of the two calls above.

> +			ret = 0;
> +		}
> +	}
> +	else if (!strcmp(q, "resume")) {

Please move this "else if" up to the same line as the "}" bracket.

> +		msecs = 0;
> +		goto do_resume;
> +	} else
> +		ret = -EINVAL;

If one branch of an if-else uses "{}" brackets, then every if, else if, and else
branch has to have "{}" brackets, even if it is a one-liner.

> +
> +out:
> +	kfree(p);
> +	return ret ? ret : len;
> +}
> +
> +static struct kobj_attribute hoad_control_attr = {
> +	.attr.name = "control",
> +	.attr.mode = S_IRUSR | S_IWUSR,
> +	.show = hoad_control_show,
> +	.store = hoad_control_store,
> +};
> +
> +static int hoad_init(void)
> +{
> +	int ret;
> +
> +	/* To be able to generate uevents, need a kobject with kset defined.
> +	 *
> +	 * To avoid extra depth inside sysfs, create a kset and use it's
> +	 * internal kobject, by setting it's 'kset' field to itself.
> +	 */
> +	struct kset *kset = kset = kset_create_and_add("hoad", NULL, mm_kobj);

1) There is a redundant "kset = kset" here.
2) A blank line is needed after the declaration.

> +	if (!kset)
> +		return -ENOMEM;
> +	hoad_kobj = &kset->kobj;
> +	hoad_kobj->kset = kset;
> +
> +	ret = sysfs_create_file(hoad_kobj, &hoad_control_attr.attr);
> +	if (ret) {
> +		hoad_kobj->kset = NULL;
> +		hoad_kobj = NULL;
> +		kset_put(kset);
> +		return ret;
> +	}

Why not:

	ret = sysfs_create_file(hoad_kobj, &hoad_control_attr.attr);
	if (ret) {
		kset_put(kset);
		return ret;
	}

	hoad_kobj = &kset->kobj;
	hoad_kobj->kset = kset;

?

Also, as I see in the kernel code, the matching counterpart of kset_create_and_add() is kset_unregister().
Why do we use kset_put() here instead?

> +
> +	hoad_trace_min_order = MAX_ORDER;
> +	hoad_resume_jiffies = jiffies;
> +	return 0;
> +}
> +late_initcall(hoad_init);
> +
>  /*
>   * This is the 'heart' of the zoned buddy allocator.
>   */
> @@ -3557,6 +3812,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
>  		!(current->flags & PF_MEMALLOC));
>  
>  	warn_high_order(order, gfp_mask);
> +	if (order > 0)
> +		hoad_notice_alloc(order, gfp_mask);
>  
>  	if (should_fail_alloc_page(gfp_mask, order))
>  		return NULL;
> 



More information about the Devel mailing list