[Devel] [PATCH] mm: high order allocation detector
Kirill Tkhai
ktkhai at virtuozzo.com
Tue Aug 31 16:54:41 MSK 2021
On 31.08.2021 14:28, Nikita Yushchenko wrote:
> High order allocation detector monitors allocations of order greater
> than zero, and generates an uevent if a configured number of allocations
> happen within configured time.
>
> In return to this uevent, userspace can enable event tracing. If a
> stream of high-order allocations continues, the trace could help to
> detect the code path causing them.
>
> HOAD has a sysfs control interface, at /sys/kernel/mm/hoad/control:
> - "enable ORDER COUNT MSECS"
> Sets up monitoring allocations of order ORDER: if COUNT such
> allocations are detected within MSECS, uevent is sent. Then further
> uevents is suspended, to avoid userspace races.
> - "disable ORDER"
> Stops monitoring allocations of order ORDER.
> - "resume [delay-msecs]"
> Allow sending a new uevent, either immediately or after the given
> delay.
>
> The uevent is generated with ACTION="change", SUBSYSTEM="hoad", ORDER
> set to thr order of the allocation that has caused the uevent.
>
> Also HOAD provides a tracepoint named "hoad", under kmem/ group, that
> could be used for tracing. This tracepoint hits on every allocation of
> order greater or equal to minimal order for which monitoring is enabled.
>
> https://jira.sw.ru/browse/PSBM-92088
> Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
> ---
> include/trace/events/kmem.h | 12 ++
> mm/page_alloc.c | 257 ++++++++++++++++++++++++++++++++++++
> 2 files changed, 269 insertions(+)
>
> diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
> index 9cb647609df3..b425c6856bfd 100644
> --- a/include/trace/events/kmem.h
> +++ b/include/trace/events/kmem.h
> @@ -305,6 +305,18 @@ TRACE_EVENT(mm_page_alloc_extfrag,
> __entry->alloc_migratetype == __entry->fallback_migratetype)
> );
>
> +TRACE_EVENT(hoad,
> + TP_PROTO(int order),
> + TP_ARGS(order),
> + TP_STRUCT__entry(
> + __field(int, order)
> + ),
> + TP_fast_assign(
> + __entry->order = order;
> + ),
> + TP_printk("order=%d", __entry->order)
> +);
> +
> #endif /* _TRACE_KMEM_H */
>
> /* This part must be outside protection */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 1ae193b26a1d..2a0d459bbaa3 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3533,6 +3533,261 @@ static __always_inline void warn_high_order(int order, gfp_t gfp_mask)
> }
> }
>
> +struct hoad_order_info {
> + unsigned long interval;
> + int max_allocs;
> + atomic_t counter;
> + unsigned long since_jiffies;
> + struct delayed_work reset_counter_work;
> +};
> +
> +static struct hoad_order_info *hoad_table[MAX_ORDER];
> +static DEFINE_MUTEX(hoad_mutex);
> +static struct kobject *hoad_kobj;
> +static int hoad_uevent_order;
> +static unsigned long hoad_resume_jiffies;
> +static int hoad_trace_min_order;
> +
> +static void hoad_reset_counter(struct work_struct *work)
> +{
> + struct hoad_order_info *hoi = container_of(work,
> + struct hoad_order_info, reset_counter_work.work);
> +
> + atomic_set(&hoi->counter, 0);
Do we really need work to do such simple action? Why plain timer is not enough?
> +}
> +
> +static void hoad_send_uevent(struct work_struct *work)
> +{
> + char order_string[16];
> + char *envp[] = { order_string, NULL };
Please, use new lines after declarations.
> + sprintf(order_string, "ORDER=%d", hoad_uevent_order);
> + kobject_uevent_env(hoad_kobj, KOBJ_CHANGE, envp);
> +}
> +static DECLARE_WORK(hoad_send_uevent_work, hoad_send_uevent);
> +
> +static void hoad_resume(struct work_struct *work)
> +{
> + hoad_uevent_order = 0;
The same is here: do we really need work here?
> +}
> +static DECLARE_DELAYED_WORK(hoad_resume_work, hoad_resume);
> +
> +static void hoad_notice_alloc(int order, gfp_t gfp)
> +{
> + struct hoad_order_info *hoi;
> + int count;
> + bool hit = false;
> +
> + if (gfp & (__GFP_NORETRY | __GFP_ORDER_NOWARN))
> + return;
> +
> + if (order >= hoad_trace_min_order)
> + trace_hoad(order);
> +
> + rcu_read_lock();
> + hoi = rcu_dereference(hoad_table[order]);
> + if (hoi) {
> + count = atomic_inc_return(&hoi->counter);
> + if (count == 1) {
> + hoi->since_jiffies = jiffies;
> + schedule_delayed_work(&hoi->reset_counter_work,
> + hoi->interval);
> + }
> + hit = (count == hoi->max_allocs);
> + }
> + rcu_read_unlock();
> +
> + if (hit) {
> + if (cmpxchg(&hoad_uevent_order, 0, order) == 0)
> + schedule_work(&hoad_send_uevent_work);
> + }
> +}
> +
> +static void hoad_install_order_info(int order, struct hoad_order_info *hoi)
> +{
> + struct hoad_order_info *oldhoi;
> + int i;
> +
> + mutex_lock(&hoad_mutex);
> + oldhoi = hoad_table[order];
> + rcu_assign_pointer(hoad_table[order], hoi);
> + for (i = 1; i < MAX_ORDER; i++) {
> + if (hoad_table[i])
> + break;
> + }
> + hoad_trace_min_order = i;
> + mutex_unlock(&hoad_mutex);
> +
> + if (oldhoi) {
> + synchronize_rcu();
> + cancel_delayed_work_sync(&oldhoi->reset_counter_work);
Why do we completely cancel this? We possible updated single order entity,
don't other orders still need to be reset?
> + kfree(oldhoi);
> + }
> +}
> +
> +static int hoad_enable_for_order(int order, int max_allocs,
> + unsigned int interval_msecs)
> +{
> + struct hoad_order_info *hoi;
> + unsigned long interval;
> +
> + if (order < 1 || order >= MAX_ORDER)
> + return -EINVAL;
> + if (max_allocs < 1)
> + return -EINVAL;
> + interval = msecs_to_jiffies(interval_msecs);
> + if (interval < 1)
> + return -EINVAL;
> +
> + hoi = kzalloc(sizeof(*hoi), GFP_KERNEL);
> + if (!hoi)
> + return -ENOMEM;
> + hoi->interval = interval;
> + hoi->max_allocs = max_allocs;
> + INIT_DELAYED_WORK(&hoi->reset_counter_work, hoad_reset_counter);
> +
> + hoad_install_order_info(order, hoi);
> + return 0;
> +}
> +
> +static int hoad_disable_for_order(int order)
> +{
> + if (order < 1 || order >= MAX_ORDER)
> + return -EINVAL;
> +
> + hoad_install_order_info(order, NULL);
> + return 0;
> +}
> +
> +static ssize_t hoad_control_show(struct kobject *kobj,
> + struct kobj_attribute *attr, char *buf)
> +{
> + char *p = buf;
> + int order;
> + struct hoad_order_info *hoi;
> + int counter;
> + long d;
> + unsigned int msecs;
> +
> + rcu_read_lock();
> + for (order = 1; order < MAX_ORDER; order++) {
> + hoi = rcu_dereference(hoad_table[order]);
> + if (hoi) {
> + counter = atomic_read(&hoi->counter);
> + msecs = counter ?
> + jiffies_to_msecs(jiffies - hoi->since_jiffies) :
> + 0;
> + p += sprintf(p, "order %u: %u/%u in %u/%u msecs\n",
> + order, counter, hoi->max_allocs,
> + msecs, jiffies_to_msecs(hoi->interval));
Any guarantees all printed in this function data fits provided @buf?
> + }
> + }
> + rcu_read_unlock();
> + if (hoad_uevent_order) {
> + p += sprintf(p, "event generation suspended");
> + d = (long)(hoad_resume_jiffies - jiffies);
> + if (d > 0) {
> + p += sprintf(p, ", resume in ");
> + msecs = jiffies_to_msecs(d);
> + if (msecs >= 1000 * 60 * 60 * 2)
> + p += sprintf(p, "%u hours",
> + msecs / (1000 * 60 * 60));
> + else if (msecs > 1000 * 60 * 2)
> + p += sprintf(p, "%u minutes",
> + msecs / (1000 * 60));
> + else
> + p += sprintf(p, "%u seconds",
> + (msecs + 999) / 1000);
> + }
> + *p++ = '\n';
> + }
> +
> + return p - buf;
> +}
> +
> +static ssize_t hoad_control_store(struct kobject *kobj,
> + struct kobj_attribute *attr, const char *buf, size_t len)
> +{
> + char *p, *q;
> + int order, max_allocs, ret;
> + unsigned int msecs;
> + unsigned long d;
> + char c;
> +
> + if (len == 0)
> + return 0;
> + p = kstrdup(buf, GFP_KERNEL);
> + if (!p)
> + return -ENOMEM;
> + q = strim(p);
> + if (*q == '\0') {
> + ret = 0;
> + goto out;
> + }
> +
> + if (sscanf(q, "enable %u %u %u%c",
> + &order, &max_allocs, &msecs, &c) == 3)
> + ret = hoad_enable_for_order(order, max_allocs, msecs);
> + else if (sscanf(q, "disable %u%c", &order, &c) == 1)
> + ret = hoad_disable_for_order(order);
> + else if (sscanf(q, "resume %u%c", &msecs, &c) == 1) {
> + if (msecs > 1000 * 60 * 60 * 24 * 5)
Every magic number should be described with #define.
> + ret = -EINVAL;
> + else {
> +do_resume:
> + d = msecs_to_jiffies(msecs);
> + hoad_resume_jiffies = jiffies + d;
> + cancel_delayed_work_sync(&hoad_resume_work);
> + schedule_delayed_work(&hoad_resume_work, d);
It looks like we may use mod_delayed_work(system_wq, ...) here instead of two calls above.
> + ret = 0;
> + }
> + }
> + else if (!strcmp(q, "resume")) {
Please, move this "else if" up to the same line with "}" bracket
> + msecs = 0;
> + goto do_resume;
> + } else
> + ret = -EINVAL;
In case of you have "{}" brackets in a single branch of if-else, then every if, else if, else
branch has to have "{}" brackets even if it's one liner.
> +
> +out:
> + kfree(p);
> + return ret ? ret : len;
> +}
> +
> +static struct kobj_attribute hoad_control_attr = {
> + .attr.name = "control",
> + .attr.mode = S_IRUSR | S_IWUSR,
> + .show = hoad_control_show,
> + .store = hoad_control_store,
> +};
> +
> +static int hoad_init(void)
> +{
> + int ret;
> +
> + /* To be able to generate uevents, need a kobject with kset defined.
> + *
> + * To avoid extra depth inside sysfs, create a kset and use it's
> + * internal kobject, by setting it's 'kset' field to itself.
> + */
> + struct kset *kset = kset = kset_create_and_add("hoad", NULL, mm_kobj);
1)kset = kset is here.
2)New line is needed after declaration.
> + if (!kset)
> + return -ENOMEM;
> + hoad_kobj = &kset->kobj;
> + hoad_kobj->kset = kset;
> +
> + ret = sysfs_create_file(hoad_kobj, &hoad_control_attr.attr);
> + if (ret) {
> + hoad_kobj->kset = NULL;
> + hoad_kobj = NULL;
> + kset_put(kset);
> + return ret;
> + }
Why not:
ret = sysfs_create_file(hoad_kobj, &hoad_control_attr.attr);
if (ret) {
kset_put(kset);
return ret;
}
hoad_kobj = &kset->kobj;
hoad_kobj->kset = kset;
?
Also, as I see in kernel code, the pair bracket for kset_create_and_add() is kset_unregister().
Why we use kset_put() instead?
> +
> + hoad_trace_min_order = MAX_ORDER;
> + hoad_resume_jiffies = jiffies;
> + return 0;
> +}
> +late_initcall(hoad_init);
> +
> /*
> * This is the 'heart' of the zoned buddy allocator.
> */
> @@ -3557,6 +3812,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
> !(current->flags & PF_MEMALLOC));
>
> warn_high_order(order, gfp_mask);
> + if (order > 0)
> + hoad_notice_alloc(order, gfp_mask);
>
> if (should_fail_alloc_page(gfp_mask, order))
> return NULL;
>
More information about the Devel
mailing list