[Devel] [PATCH v14 16/18] vmpressure: in-kernel notifications

Vladimir Davydov vdavydov at parallels.com
Fri Dec 20 06:36:22 PST 2013


On 12/20/2013 06:31 PM, Glauber Costa wrote:
>> I have the exact problem described above for a project I'm working on
>> and this solution seems to solve it well.
>>
>> However, I had a few issues while trying to use this interface. I'll
>> comment on them below, but please take this more as advice seeking
>> than patch review.
>>
>>> This patch extends that to also support in-kernel users. Events that
>>> should be generated for in-kernel consumption will be marked as such,
>>> and for those, we will call a registered function instead of triggering
>>> an eventfd notification.
>>>
>>> Please note that due to my lack of understanding of each shrinker user,
>>> I will stay away from converting the actual users, you are all welcome
>>> to do so.
>>>
>>> Signed-off-by: Glauber Costa <glommer at openvz.org>
>>> Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
>>> Acked-by: Anton Vorontsov <anton at enomsg.org>
>>> Acked-by: Pekka Enberg <penberg at kernel.org>
>>> Reviewed-by: Greg Thelen <gthelen at google.com>
>>> Cc: Dave Chinner <dchinner at redhat.com>
>>> Cc: John Stultz <john.stultz at linaro.org>
>>> Cc: Andrew Morton <akpm at linux-foundation.org>
>>> Cc: Joonsoo Kim <iamjoonsoo.kim at lge.com>
>>> Cc: Michal Hocko <mhocko at suse.cz>
>>> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu at jp.fujitsu.com>
>>> Cc: Johannes Weiner <hannes at cmpxchg.org>
>>> ---
>>>  include/linux/vmpressure.h |    5 +++++
>>>  mm/vmpressure.c            |   53 +++++++++++++++++++++++++++++++++++++++++---
>>>  2 files changed, 55 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
>>> index 3f3788d..9102e53 100644
>>> --- a/include/linux/vmpressure.h
>>> +++ b/include/linux/vmpressure.h
>>> @@ -19,6 +19,9 @@ struct vmpressure {
>>>       /* Have to grab the lock on events traversal or modifications. */
>>>       struct mutex events_lock;
>>>
>>> +     /* False if only kernel users want to be notified, true otherwise. */
>>> +     bool notify_userspace;
>>> +
>>>       struct work_struct work;
>>>  };
>>>
>>> @@ -38,6 +41,8 @@ extern int vmpressure_register_event(struct cgroup_subsys_state *css,
>>>                                    struct cftype *cft,
>>>                                    struct eventfd_ctx *eventfd,
>>>                                    const char *args);
>>> +extern int vmpressure_register_kernel_event(struct cgroup_subsys_state *css,
>>> +                                         void (*fn)(void));
>>>  extern void vmpressure_unregister_event(struct cgroup_subsys_state *css,
>>>                                       struct cftype *cft,
>>>                                       struct eventfd_ctx *eventfd);
>>> diff --git a/mm/vmpressure.c b/mm/vmpressure.c
>>> index e0f6283..730e7c1 100644
>>> --- a/mm/vmpressure.c
>>> +++ b/mm/vmpressure.c
>>> @@ -130,8 +130,12 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
>>>  }
>>>
>>>  struct vmpressure_event {
>>> -     struct eventfd_ctx *efd;
>>> +     union {
>>> +             struct eventfd_ctx *efd;
>>> +             void (*fn)(void);
>> How does the callback access its private data?
>>
>>> +     };
>>>       enum vmpressure_levels level;
>>> +     bool kernel_event;
>>>       struct list_head node;
>>>  };
>>>
>>> @@ -147,12 +151,15 @@ static bool vmpressure_event(struct vmpressure *vmpr,
>>>       mutex_lock(&vmpr->events_lock);
>>>
>>>       list_for_each_entry(ev, &vmpr->events, node) {
>>> -             if (level >= ev->level) {
>>> +             if (ev->kernel_event) {
>>> +                     ev->fn();
>> I think it would be interesting to pass 'level' to the callback (I'll
>> probably use it myself), but we could wait for a in-tree user before
>> adding it.
>>
>>> +             } else if (vmpr->notify_userspace && level >= ev->level) {
>>>                       eventfd_signal(ev->efd, 1);
>>>                       signalled = true;
>>>               }
>>>       }
>>>
>>> +     vmpr->notify_userspace = false;
>>>       mutex_unlock(&vmpr->events_lock);
>>>
>>>       return signalled;
>>> @@ -222,7 +229,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
>>>        * we account it too.
>>>        */
>>>       if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
>>> -             return;
>>> +             goto schedule;
>>>
>>>       /*
>>>        * If we got here with no pages scanned, then that is an indicator
>>> @@ -239,8 +246,15 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
>>>       vmpr->scanned += scanned;
>>>       vmpr->reclaimed += reclaimed;
>>>       scanned = vmpr->scanned;
>>> +     /*
>>> +      * If we didn't reach this point, only kernel events will be triggered.
>>> +      * It is the job of the worker thread to clean this up once the
>>> +      * notifications are all delivered.
>>> +      */
>>> +     vmpr->notify_userspace = true;
>>>       spin_unlock(&vmpr->sr_lock);
>>>
>>> +schedule:
>>>       if (scanned < vmpressure_win)
>>>               return;
>>>       schedule_work(&vmpr->work);
>>> @@ -324,6 +338,39 @@ int vmpressure_register_event(struct cgroup_subsys_state *css,
>>>  }
>>>
>>>  /**
>>> + * vmpressure_register_kernel_event() - Register kernel-side notification
>>> + * @css:     css that is interested in vmpressure notifications
>>> + * @fn:              function to be called when pressure happens
>>> + *
>>> + * This function register in-kernel users interested in receiving notifications
>>> + * about pressure conditions. Pressure notifications will be triggered at the
>>> + * same time as userspace notifications (with no particular ordering relative
>>> + * to it).
>>> + *
>>> + * Pressure notifications are a alternative method to shrinkers and will serve
>>> + * well users that are interested in a one-shot notification, with a
>>> + * well-defined cgroup aware interface.
>>> + */
>>> +int vmpressure_register_kernel_event(struct cgroup_subsys_state *css,
>>> +                                   void (*fn)(void))
>>> +{
>>> +     struct vmpressure *vmpr = css_to_vmpressure(css);
>> This doesn't allow for css=NULL. What's the recommended way for a today's
>> shrinker (which is not related to cgroups) to register with this API?
>>
>> Also, you don't seem to provide a way to de-register from the event.
>>
> The answer for all of your questions above can be summarized by noting
> that for the lack of other users (at the time), this patch does the bare minimum
> for memcg needs. I agree, for instance, that it would be good to pass the level
> but since memcg won't do anything with thta, I didn't pass it.
>
> That should be extended if you need to.
>
>> I hacked a patch to be able to use this, seems to work but it's a ugly
>> hack:
>>
>> ---
>>  include/linux/vmpressure.h |  3 ++-
>>  mm/vmpressure.c            | 13 +++++++++----
>>  2 files changed, 11 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
>> index 9102e53..de416b6 100644
>> --- a/include/linux/vmpressure.h
>> +++ b/include/linux/vmpressure.h
>> @@ -42,7 +42,8 @@ extern int vmpressure_register_event(struct cgroup_subsys_state *css,
>>                                      struct eventfd_ctx *eventfd,
>>                                      const char *args);
>>  extern int vmpressure_register_kernel_event(struct cgroup_subsys_state *css,
>> -                                           void (*fn)(void));
>> +                                           void (*fn)(void *data, int level),
>> +                                           void *data);
>>  extern void vmpressure_unregister_event(struct cgroup_subsys_state *css,
>>                                         struct cftype *cft,
>>                                         struct eventfd_ctx *eventfd);
>> diff --git a/mm/vmpressure.c b/mm/vmpressure.c
>> index 730e7c1..4ed0e85 100644
>> --- a/mm/vmpressure.c
>> +++ b/mm/vmpressure.c
>> @@ -132,9 +132,10 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
>>  struct vmpressure_event {
>>         union {
>>                 struct eventfd_ctx *efd;
>> -               void (*fn)(void);
>> +               void (*fn)(void *data, int level);
>>         };
>>         enum vmpressure_levels level;
>> +       void *data;
>>         bool kernel_event;
>>         struct list_head node;
>>  };
>> @@ -152,7 +153,7 @@ static bool vmpressure_event(struct vmpressure *vmpr,
>>
>>         list_for_each_entry(ev, &vmpr->events, node) {
>>                 if (ev->kernel_event) {
>> -                       ev->fn();
>> +                       ev->fn(ev->data, level);
>>                 } else if (vmpr->notify_userspace && level >= ev->level) {
>>                         eventfd_signal(ev->efd, 1);
>>                         signalled = true;
>> @@ -352,21 +353,25 @@ int vmpressure_register_event(struct cgroup_subsys_state *css,
>>   * well-defined cgroup aware interface.
>>   */
>>  int vmpressure_register_kernel_event(struct cgroup_subsys_state *css,
>> -                                     void (*fn)(void))
>> +                                    void (*fn)(void *data, int level), void *data)
>>  {
>> -       struct vmpressure *vmpr = css_to_vmpressure(css);
>> +       struct vmpressure *vmpr;
>>         struct vmpressure_event *ev;
>>
>> +       vmpr = css ? css_to_vmpressure(css) : memcg_to_vmpressure(NULL);
>> +
>>         ev = kzalloc(sizeof(*ev), GFP_KERNEL);
>>         if (!ev)
>>                 return -ENOMEM;
>>
>>         ev->kernel_event = true;
>> +       ev->data = data;
>>         ev->fn = fn;
>>
>>         mutex_lock(&vmpr->events_lock);
>>         list_add(&ev->node, &vmpr->events);
>>         mutex_unlock(&vmpr->events_lock);
>> +
> Your patch makes sense.

I guess I'll include this to the next iteration of this patch-set then.

Thanks.



More information about the Devel mailing list