[Devel] Re: [PATCH 3/7] per-cgroup slab caches
KAMEZAWA Hiroyuki
kamezawa.hiroyu at jp.fujitsu.com
Tue Feb 21 17:21:38 PST 2012
On Tue, 21 Feb 2012 15:34:35 +0400
Glauber Costa <glommer at parallels.com> wrote:
> This patch creates the infrastructure to allow us to register
> per-memcg slab caches. As an example implementation, I am tracking
> the dentry cache, but others will follow.
>
> I am using an opt-in instead of an opt-out system here: this means the
> cache needs to explicitly register itself to be tracked by memcg.
> I prefer this approach since:
>
> 1) not all caches are big enough to be relevant,
> 2) most of the relevant ones will involve shrinking in some way,
> and it would be better to be sure they are shrinker-aware,
> 3) some objects, like network sockets, have their very own idea
> of memory control, which goes beyond the allocation itself.
>
> Once registered, allocations made on behalf of a task living
> in a cgroup will be billed to it. It is a first-touch mechanism,
> but it follows what we have today, and the cgroup infrastructure
> itself. No overhead is expected in object allocation: billing
> only happens when slab pages are allocated and freed.
>
> The allocation stays billed to a cgroup until that cgroup is
> destroyed. It is kept if a task leaves the cgroup, since it is close to
> impossible to efficiently map an object to a task - and the tracking
> is done by pages, which contain multiple objects.
>
Hmm.... can't we do this with something like:

  kmem_cache = kmem_cache_create(...., ...., SLAB_XXX_XXX | SLAB_MEMCG_AWARE);
  kmem_cache_alloc(kmem_cache, flags);
    => find the memcg_kmem_cache for the thread; if it doesn't exist, create it.
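In rough C, the lookup at allocation time would be something like the
sketch below. This is just to show the idea: SLAB_MEMCG_AWARE and
memcg_cache_lookup() are made-up names, and kmem_cache_dup() is the
helper from your patch.

  /*
   * Sketch only -- SLAB_MEMCG_AWARE and memcg_cache_lookup() are
   * hypothetical; kmem_cache_dup() is from this patch.  Reference
   * counting / RCU details omitted.
   */
  static struct kmem_cache *memcg_cache_for(struct kmem_cache *s)
  {
          struct mem_cgroup *memcg;
          struct kmem_cache *mc;

          if (!(s->flags & SLAB_MEMCG_AWARE))
                  return s;

          memcg = mem_cgroup_from_task(current);
          if (!memcg || mem_cgroup_is_root(memcg))
                  return s;

          mc = memcg_cache_lookup(memcg, s);      /* hypothetical */
          if (!mc)
                  mc = kmem_cache_dup(memcg, s);  /* create on first use */

          return mc ? mc : s;                     /* fall back on failure */
  }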
BTW, comparing ANON/FILE caches, we'll have many kinds of gfp_t flags.
Do we handle them correctly?

Maybe I don't fully understand your implementation... but I'll try to comment.
> Signed-off-by: Glauber Costa <glommer at parallels.com>
> CC: Kirill A. Shutemov <kirill at shutemov.name>
> CC: Greg Thelen <gthelen at google.com>
> CC: Johannes Weiner <jweiner at redhat.com>
> CC: Michal Hocko <mhocko at suse.cz>
> CC: Hiroyuki Kamezawa <kamezawa.hiroyu at jp.fujitsu.com>
> CC: Paul Turner <pjt at google.com>
> CC: Frederic Weisbecker <fweisbec at gmail.com>
> CC: Pekka Enberg <penberg at kernel.org>
> CC: Christoph Lameter <cl at linux.com>
> ---
> include/linux/memcontrol.h | 29 ++++++++++++++
> include/linux/slab.h | 8 ++++
> include/linux/slub_def.h | 2 +
> mm/memcontrol.c | 93 ++++++++++++++++++++++++++++++++++++++++++++
> mm/slub.c | 68 +++++++++++++++++++++++++++++--
> 5 files changed, 195 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 4d34356..95e7e19 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -21,12 +21,41 @@
> #define _LINUX_MEMCONTROL_H
> #include <linux/cgroup.h>
> #include <linux/vm_event_item.h>
> +#include <linux/slab.h>
> +#include <linux/workqueue.h>
> +#include <linux/shrinker.h>
>
> struct mem_cgroup;
> struct page_cgroup;
> struct page;
> struct mm_struct;
>
> +struct memcg_kmem_cache {
> + struct kmem_cache *cache;
> + struct mem_cgroup *memcg; /* Should be able to do without this */
> +};
> +
> +struct memcg_cache_struct {
> + int index;
> + struct kmem_cache *cache;
> +};
> +
> +enum memcg_cache_indexes {
> + CACHE_DENTRY,
> + NR_CACHES,
> +};
> +
> +int memcg_kmem_newpage(struct mem_cgroup *memcg, struct page *page, unsigned long pages);
> +void memcg_kmem_freepage(struct mem_cgroup *memcg, struct page *page, unsigned long pages);
> +struct mem_cgroup *memcg_from_shrinker(struct shrinker *s);
> +
> +struct memcg_kmem_cache *memcg_cache_get(struct mem_cgroup *memcg, int index);
> +void register_memcg_cache(struct memcg_cache_struct *cache);
> +void memcg_slab_destroy(struct kmem_cache *cache, struct mem_cgroup *memcg);
> +
> +struct kmem_cache *
> +kmem_cache_dup(struct mem_cgroup *memcg, struct kmem_cache *base);
> +
> /* Stats that can be updated by kernel. */
> enum mem_cgroup_page_stat_item {
> MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 573c809..8a372cd 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -98,6 +98,14 @@
> void __init kmem_cache_init(void);
> int slab_is_available(void);
>
> +struct mem_cgroup;
> +
> +unsigned long slab_nr_pages(struct kmem_cache *s);
> +
> +struct kmem_cache *kmem_cache_create_cg(struct mem_cgroup *memcg,
> + const char *, size_t, size_t,
> + unsigned long,
> + void (*)(void *));
> struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
> unsigned long,
> void (*)(void *));
> diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
> index a32bcfd..4f39fff 100644
> --- a/include/linux/slub_def.h
> +++ b/include/linux/slub_def.h
> @@ -100,6 +100,8 @@ struct kmem_cache {
> struct kobject kobj; /* For sysfs */
> #endif
>
> + struct mem_cgroup *memcg;
> + struct kmem_cache *parent_slab; /* can be moved out of here as well */
> #ifdef CONFIG_NUMA
> /*
> * Defragmentation by allocating from a remote node.
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 26fda11..2aa35b0 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -49,6 +49,7 @@
> #include <linux/page_cgroup.h>
> #include <linux/cpu.h>
> #include <linux/oom.h>
> +#include <linux/slab.h>
> #include "internal.h"
> #include <net/sock.h>
> #include <net/tcp_memcontrol.h>
> @@ -302,8 +303,11 @@ struct mem_cgroup {
> #ifdef CONFIG_INET
> struct tcp_memcontrol tcp_mem;
> #endif
> + struct memcg_kmem_cache kmem_cache[NR_CACHES];
> };
>
> +struct memcg_cache_struct *kmem_avail_caches[NR_CACHES];
> +
What does this pointer array hold? Can it be accessed without any locks?
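If registration can race with memcg creation, I'd expect at least
something like the sketch below (the mutex is made up, just to show
the point):

  static DEFINE_MUTEX(kmem_avail_caches_mutex);   /* hypothetical */

  void register_memcg_cache(struct memcg_cache_struct *cache)
  {
          mutex_lock(&kmem_avail_caches_mutex);
          BUG_ON(kmem_avail_caches[cache->index]);
          kmem_avail_caches[cache->index] = cache;
          mutex_unlock(&kmem_avail_caches_mutex);
  }

and readers like memcg_create_kmem_caches() would need the same lock
(or RCU).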
> /* Stuffs for move charges at task migration. */
> /*
> * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
> @@ -4980,6 +4984,93 @@ err_cleanup:
>
> }
>
> +struct memcg_kmem_cache *memcg_cache_get(struct mem_cgroup *memcg, int index)
> +{
> + return &memcg->kmem_cache[index];
> +}
> +
> +void register_memcg_cache(struct memcg_cache_struct *cache)
> +{
> + BUG_ON(kmem_avail_caches[cache->index]);
> +
> + kmem_avail_caches[cache->index] = cache;
> +}
> +
> +#define memcg_kmem(memcg) \
> + (memcg->kmem_independent_accounting ? &memcg->kmem : &memcg->res)
> +
> +struct kmem_cache *
> +kmem_cache_dup(struct mem_cgroup *memcg, struct kmem_cache *base)
> +{
> + struct kmem_cache *s;
> + unsigned long pages;
> + struct res_counter *fail;
> + /*
> + * TODO: We should use an ida-like index here, instead
> + * of the kernel address
> + */
> + char *kname = kasprintf(GFP_KERNEL, "%s-%p", base->name, memcg);
> +
> + WARN_ON(mem_cgroup_is_root(memcg));
> +
> + if (!kname)
> + return NULL;
> +
> + s = kmem_cache_create_cg(memcg, kname, base->size,
> + base->align, base->flags, base->ctor);
> + if (WARN_ON(!s))
> + goto out;
> +
> +
> + pages = slab_nr_pages(s);
> +
> + if (res_counter_charge(memcg_kmem(memcg), pages << PAGE_SHIFT, &fail)) {
> + kmem_cache_destroy(s);
> + s = NULL;
> + }
Why should 'pages' be charged to a new memcg?
Does a newly created memcg start with res_counter.usage != 0 ??
> +
> + mem_cgroup_get(memcg);
get even on allocation failure? (and what is the refcnt being updated for?)
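i.e., I'd expect the get to happen only on success, roughly:

          if (res_counter_charge(memcg_kmem(memcg), pages << PAGE_SHIFT, &fail)) {
                  kmem_cache_destroy(s);
                  s = NULL;
                  goto out;               /* don't pin the memcg */
          }

          mem_cgroup_get(memcg);          /* pin only while the cache exists */
  out:
          kfree(kname);
          return s;

(just a sketch of the ordering I'd expect, not a tested change.)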
> +out:
> + kfree(kname);
> + return s;
> +}
> +
> +int memcg_kmem_newpage(struct mem_cgroup *memcg, struct page *page, unsigned long pages)
> +{
> + unsigned long size = pages << PAGE_SHIFT;
> + struct res_counter *fail;
> +
> + return res_counter_charge(memcg_kmem(memcg), size, &fail);
> +}
> +
> +void memcg_kmem_freepage(struct mem_cgroup *memcg, struct page *page, unsigned long pages)
> +{
> + unsigned long size = pages << PAGE_SHIFT;
> +
> + res_counter_uncharge(memcg_kmem(memcg), size);
> +}
> +
> +void memcg_create_kmem_caches(struct mem_cgroup *memcg)
> +{
> + int i;
> +
> + for (i = 0; i < NR_CACHES; i++) {
> + struct kmem_cache *cache;
> +
> + if (!kmem_avail_caches[i] || !kmem_avail_caches[i]->cache)
> + continue;
> +
> + cache = kmem_avail_caches[i]->cache;
> +
> + if (mem_cgroup_is_root(memcg))
> + memcg->kmem_cache[i].cache = cache;
> + else
> + memcg->kmem_cache[i].cache = kmem_cache_dup(memcg, cache);
> + memcg->kmem_cache[i].memcg = memcg;
> + }
> +}
Hmm... a memcg has to know _all_ kmem_caches at the time it is created,
right? Then modules will not be able to use memcg-aware kmem_caches...
(see the sketch below)
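If a cache can be registered after memcgs already exist (as a module
load would), registration would have to walk the existing cgroups,
something like this sketch (using the for_each_mem_cgroup() iterator
we have in memcontrol.c; untested, locking omitted):

  /* sketch only: create the per-memcg copy for every existing memcg */
  void register_memcg_cache(struct memcg_cache_struct *cache)
  {
          struct mem_cgroup *memcg;
          int i = cache->index;

          BUG_ON(kmem_avail_caches[i]);
          kmem_avail_caches[i] = cache;

          for_each_mem_cgroup(memcg) {
                  if (mem_cgroup_is_root(memcg))
                          memcg->kmem_cache[i].cache = cache->cache;
                  else
                          memcg->kmem_cache[i].cache =
                                  kmem_cache_dup(memcg, cache->cache);
                  memcg->kmem_cache[i].memcg = memcg;
          }
  }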
> +
> +
> static struct cgroup_subsys_state * __ref
> mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
> {
> @@ -5039,6 +5130,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
> if (parent)
> memcg->swappiness = mem_cgroup_swappiness(parent);
> atomic_set(&memcg->refcnt, 1);
> +
> + memcg_create_kmem_caches(memcg);
> memcg->move_charge_at_immigrate = 0;
> mutex_init(&memcg->thresholds_lock);
> return &memcg->css;
> diff --git a/mm/slub.c b/mm/slub.c
> index 4907563..f3815ec 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -31,6 +31,8 @@
> #include <linux/stacktrace.h>
>
> #include <trace/events/kmem.h>
> +#include <linux/cgroup.h>
> +#include <linux/memcontrol.h>
>
> /*
> * Lock order:
> @@ -1281,6 +1283,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> struct page *page;
> struct kmem_cache_order_objects oo = s->oo;
> gfp_t alloc_gfp;
> + int pages;
>
> flags &= gfp_allowed_mask;
>
> @@ -1314,9 +1317,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> if (!page)
> return NULL;
>
> + pages = 1 << oo_order(oo);
> +
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
> + if (s->memcg && memcg_kmem_newpage(s->memcg, page, pages) < 0) {
> + __free_pages(page, oo_order(oo));
> + return NULL;
> + }
> +#endif
Hmm, no reclaim happens at allocation failure?
Doesn't that turn into a very bad user experience?
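I'd expect the charge path to try reclaim before failing, roughly the
way the user-page charge path does. A sketch (the reclaim call here is
illustrative, not a tested change):

  /*
   * Sketch only: retry the kmem charge with targeted reclaim before
   * giving up, like the user page charge path does.
   */
  static int memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
                               unsigned long size)
  {
          struct res_counter *fail;
          int retries = 3;

          while (res_counter_charge(memcg_kmem(memcg), size, &fail)) {
                  if (!(gfp & __GFP_WAIT) || !retries--)
                          return -ENOMEM;
                  mem_cgroup_reclaim(memcg, gfp, MEM_CGROUP_RECLAIM_NOSWAP);
          }
          return 0;
  }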
Thanks,
-Kame