diff --git a/fs/dcache.c b/fs/dcache.c
index a88948b..cd5d091 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -142,7 +142,7 @@ static void __d_free(struct rcu_head *head)
 	WARN_ON(!list_empty(&dentry->d_alias));
 	if (dname_external(dentry))
 		kfree(dentry->d_name.name);
-	kmem_cache_free(dentry_cache, dentry);
+	kmem_cache_free(dentry->dentry_cache, dentry);
 }
 
 /*
@@ -1160,6 +1160,8 @@ void shrink_dcache_parent(struct dentry * parent)
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
+static struct memcg_slab_ctlr dcache_ctlr;
+
 /**
  * __d_alloc - allocate a dcache entry
  * @sb: filesystem it will belong to
@@ -1173,9 +1175,14 @@ EXPORT_SYMBOL(shrink_dcache_parent);
 struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 {
 	struct dentry *dentry;
+	struct kmem_cache *current_dcache;
 	char *dname;
 
-	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+	current_dcache = current_kmem_cache(dcache_ctlr.token);
+	if (unlikely(!current_dcache))
+		current_dcache = dentry_cache;
+
+	dentry = kmem_cache_alloc(current_dcache, GFP_KERNEL);
 	if (!dentry)
 		return NULL;
 
@@ -1210,6 +1217,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	INIT_LIST_HEAD(&dentry->d_alias);
 	INIT_LIST_HEAD(&dentry->d_u.d_child);
 	d_set_d_op(dentry, dentry->d_sb->s_d_op);
+	dentry->dentry_cache = current_dcache;
 
 	this_cpu_inc(nr_dentry);
 
@@ -2945,6 +2953,35 @@ static void __init dcache_init_early(void)
 		INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
 }
 
+static struct kmem_cache *dcache_new_kmem(struct mem_cgroup *memcg,
+					  struct mem_cgroup *parent)
+{
+	char *name;
+
+	if (!parent)
+		return dentry_cache;
+
+	name = kasprintf(GFP_KERNEL, "memcg-dentry-%p", memcg);
+
+	return kmem_cache_create(name, sizeof(struct dentry),
+				 __alignof__(struct dentry),
+				 SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD, NULL);
+}
+
+static void dcache_destroy_kmem(struct kmem_cache *cachep)
+{
+	if (cachep == dentry_cache)
+		return;
+
+	kfree(cachep->name);
+	kmem_cache_destroy(cachep);
+}
+
+static struct memcg_slab_ctlr dcache_ctlr = {
+	.memcg_create_kmem_cache = dcache_new_kmem,
+	.memcg_destroy_kmem_cache = dcache_destroy_kmem,
+};
+
 static void __init dcache_init(void)
 {
 	int loop;
@@ -2957,6 +2994,8 @@ static void __init dcache_init(void)
 	dentry_cache = KMEM_CACHE(dentry,
 		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
 
+	register_memcg_slab_ctlr(&dcache_ctlr);
+
 	/* Hash may have been set up in dcache_init_early */
 	if (!hashdist)
 		return;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 62157c0..a002292 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -142,6 +142,7 @@ struct dentry {
 	} d_u;
 	struct list_head d_subdirs;	/* our children */
 	struct list_head d_alias;	/* inode alias list */
+	struct kmem_cache *dentry_cache;
 };
 
 /*
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 343bd76..1b7ef0c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -158,6 +158,9 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail);
 bool mem_cgroup_bad_page_check(struct page *page);
 void mem_cgroup_print_bad_page(struct page *page);
 #endif
+
+struct kmem_cache *current_kmem_cache(int token);
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct mem_cgroup;
 
@@ -376,5 +379,33 @@ mem_cgroup_print_bad_page(struct page *page)
 {
 }
 #endif
 
+struct memcg_slab_ctlr {
+	struct list_head list;
+	struct kmem_cache *cachep;
+	struct kmem_cache *(*memcg_create_kmem_cache)(struct mem_cgroup *memcg,
+						      struct mem_cgroup *parent);
+	void (*memcg_destroy_kmem_cache)(struct kmem_cache *cachep);
+	int token;
+};
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+int mem_cgroup_kmem_charge(struct kmem_cache *cachep, gfp_t flags, int nr_pages);
+void mem_cgroup_kmem_uncharge(struct kmem_cache *cachep, int nr_pages);
+void register_memcg_slab_ctlr(struct memcg_slab_ctlr *ctlr);
+
+#else
+static inline int mem_cgroup_kmem_charge(struct kmem_cache *cachep, gfp_t flags, int nr_pages)
+{
+	return 0;
+}
+static inline void mem_cgroup_kmem_uncharge(struct kmem_cache *cachep, int nr_pages)
+{
+}
+
+static inline void register_memcg_slab_ctlr(struct memcg_slab_ctlr *ctlr)
+{
+}
+#endif
+
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 573c809..0bdefe2 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -103,6 +103,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			void (*)(void *));
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
+int kmem_cache_shrink_pages(struct kmem_cache *, int nr_pages);
 void kmem_cache_free(struct kmem_cache *, void *);
 unsigned int kmem_cache_size(struct kmem_cache *);
 
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index d00e0ba..1ec3bc0 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -23,6 +23,8 @@
  * manages a cache.
  */
 
+struct mem_cgroup;
+
 struct kmem_cache {
 /* 1) Cache tunables. Protected by cache_chain_mutex */
 	unsigned int batchcount;
@@ -55,6 +57,8 @@ struct kmem_cache {
 /* 4) cache creation/removal */
 	const char *name;
 	struct list_head next;
+	struct mem_cgroup *memcg;
+	struct list_head memcg_list;
 
 /* 5) statistics */
 #ifdef CONFIG_DEBUG_SLAB
@@ -111,6 +115,15 @@ extern struct cache_sizes malloc_sizes[];
 void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
 void *__kmalloc(size_t size, gfp_t flags);
 
+static inline void slab_associate_memcg(struct kmem_cache *kmem, struct mem_cgroup *memcg)
+{
+	kmem->memcg = memcg;
+}
+static inline struct mem_cgroup *slab_cache_memcg(struct kmem_cache *kmem)
+{
+	return kmem->memcg;
+}
+
 #ifdef CONFIG_TRACING
 extern void *kmem_cache_alloc_trace(size_t size,
 				    struct kmem_cache *cachep, gfp_t flags);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 71de028..616e9f1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -294,8 +294,16 @@ struct mem_cgroup {
 	 */
 	struct mem_cgroup_stat_cpu nocpu_base;
 	spinlock_t pcp_counter_lock;
+
+	struct kmem_cache *kmem_caches[1024];
+
+//	struct kmem_cache *dentry_cache;
 };
 
+static DEFINE_RWLOCK(memcg_slab_lock);
+static LIST_HEAD(memcg_slab_list);
+static int memcg_token = 0;
+
 /* Stuffs for move charges at task migration. */
 /*
  * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
@@ -2730,6 +2738,84 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	return 0;
 }
 
+/* this header is purely bogus */
+int mem_cgroup_kmem_charge(struct kmem_cache *cachep, gfp_t gfp_mask, int pages)
+{
+	struct mem_cgroup *memcg = slab_cache_memcg(cachep);
+	struct res_counter *dummy;
+	unsigned long long excess;
+	int ret, shrink_attempts;
+
+	if (!memcg)
+		return 0;
+
+	ret = res_counter_charge(&memcg->kmem, PAGE_SIZE * pages, &dummy);
+
+	/* FIXME: This could very well come from charge for free!! */
+	excess = res_counter_soft_limit_excess(&memcg->kmem) >> PAGE_SHIFT;
+
+	/*
+	 * If we really have a total kmem cgroup, this one assumes the current
+	 * allocator is the one to blame. It might not be ideal in some cases,
+	 * but it should get it right in general. An alternative approach would
+	 * be to loop the following code through all registered caches in this
+	 * memcg.
+	 */
+	if (!excess)
+		return ret;
+
+	/*
+	 * root memcg should never be in excess, thus, never triggering any
+	 * reclaim
+	 */
+	WARN_ON(mem_cgroup_is_root(memcg));
+
+	/*
+	 * First, we try a generic reclaim. This only touches the current
+	 * cachep, not other users in the system. However, it doesn't know
+	 * anything about the objects, and is not guaranteed to do a good job.
+	 *
+	 */
+	excess -= kmem_cache_shrink_pages(cachep, excess << 1);
+
+	shrink_attempts = 0;
+	/*
+	 * Second attempt: If we don't think we freed enough
+	 * entities, try the shrinkers. 5 attempts is pretty arbitrary.
+	 */
+	while (excess > 0 && shrink_attempts++ < 5) {
+		int nr;
+
+		struct shrink_control shrink = {
+			.gfp_mask = GFP_KERNEL,
+		};
+
+		/*
+		 * We need a better reclaim mechanism that
+		 * at least does not punish other cgroups
+		 */
+		nr = shrink_slab(&shrink, 1000, 1000);
+		if (!nr)
+			break;
+	}
+
+	/* After all these shrinks, worth another attempt */
+	if (ret)
+		ret = res_counter_charge(&memcg->kmem, PAGE_SIZE * pages, &dummy);
+
+	return ret;
+}
+
+void mem_cgroup_kmem_uncharge(struct kmem_cache *cachep, int pages)
+{
+	struct mem_cgroup *memcg = slab_cache_memcg(cachep);
+
+	if (!memcg)
+		return;
+
+	res_counter_uncharge(&memcg->kmem, PAGE_SIZE * pages);
+}
+
 int mem_cgroup_newpage_charge(struct page *page,
 			      struct mm_struct *mm, gfp_t gfp_mask)
 {
@@ -3928,6 +4014,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 			break;
 		if (type == _MEM)
 			ret = mem_cgroup_resize_limit(memcg, val);
+		else if (type == _KMEM)
+			WARN_ON(res_counter_set_limit(&memcg->kmem, val));
 		else
 			ret = mem_cgroup_resize_memsw_limit(memcg, val);
 		break;
@@ -3942,6 +4030,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 		 */
 		if (type == _MEM)
 			ret = res_counter_set_soft_limit(&memcg->res, val);
+		else if (type == _KMEM)
+			WARN_ON(res_counter_set_soft_limit(&memcg->kmem, val));
 		else
 			ret = -EINVAL;
 		break;
@@ -3998,6 +4088,8 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 	case RES_FAILCNT:
 		if (type == _MEM)
 			res_counter_reset_failcnt(&mem->res);
+		else if (type == _KMEM)
+			res_counter_reset_failcnt(&mem->kmem);
 		else
 			res_counter_reset_failcnt(&mem->memsw);
 		break;
@@ -4754,6 +4846,7 @@ static struct cftype memsw_cgroup_files[] = {
 		.trigger = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read,
 	},
+
 };
 
 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
@@ -4786,8 +4879,28 @@ static struct cftype kmem_cgroup_files[] = {
 	{
 		.name = "kmem.limit_in_bytes",
 		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+		.write_string = mem_cgroup_write,
 		.read_u64 = mem_cgroup_read,
 	},
+	{
+		.name = "kmem.soft_limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_KMEM, RES_SOFT_LIMIT),
+		.write_string = mem_cgroup_write,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "kmem.failcnt",
+		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
+		.trigger = mem_cgroup_reset,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "kmem.max_usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
+		.trigger = mem_cgroup_reset,
+		.read_u64 = mem_cgroup_read,
+	},
+
 };
 
 static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
@@ -4966,6 +5079,65 @@ static int mem_cgroup_soft_limit_tree_init(void)
 	return 0;
 }
 
+struct kmem_cache *current_kmem_cache(int token)
+{
+	struct mem_cgroup *memcg;
+	struct kmem_cache *cachep = NULL;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(current);
+
+	if (!memcg)
+		goto out;
+
+	cachep = memcg->kmem_caches[token];
+out:
+	rcu_read_unlock();
+	return cachep;
+}
+
+static void create_slab_caches(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+	struct memcg_slab_ctlr *ctlr;
+	read_lock(&memcg_slab_lock);
+
+	list_for_each_entry(ctlr, &memcg_slab_list, list) {
+		struct kmem_cache *cachep;
+		cachep = ctlr->memcg_create_kmem_cache(memcg, parent);
+		memcg->kmem_caches[ctlr->token] = cachep;
+		slab_associate_memcg(cachep, memcg);
+	}
+
+	read_unlock(&memcg_slab_lock);
+}
+
+static void destroy_slab_caches(struct mem_cgroup *memcg)
+{
+	struct memcg_slab_ctlr *ctlr;
+
+	read_lock(&memcg_slab_lock);
+	list_for_each_entry(ctlr, &memcg_slab_list, list)
+		ctlr->memcg_destroy_kmem_cache(memcg->kmem_caches[ctlr->token]);
+
+	read_unlock(&memcg_slab_lock);
+}
+
+void register_memcg_slab_ctlr(struct memcg_slab_ctlr *ctlr)
+{
+	int tk;
+
+	write_lock(&memcg_slab_lock);
+	tk = memcg_token++;
+	if (tk >= 1024) {
+		printk(KERN_WARNING "Too many slab caches for kmem cgroup\n");
+		goto out;
+	}
+	INIT_LIST_HEAD(&ctlr->list);
+	list_add(&ctlr->list, &memcg_slab_list);
+	ctlr->token = tk;
+out:
+	write_unlock(&memcg_slab_lock);
+}
 static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
@@ -4995,11 +5167,14 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 			INIT_WORK(&stock->work, drain_local_stock);
 		}
 		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+		//mem->dentry_cache = dentry_cache;
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
 		mem->oom_kill_disable = parent->oom_kill_disable;
+		//mem->dentry_cache = new_dcache();
 	}
+	create_slab_caches(mem, parent);
 
 	if (parent && parent->use_hierarchy) {
 		res_counter_init(&mem->res, &parent->res);
@@ -5046,6 +5221,8 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
 
+	destroy_slab_caches(mem);
+
 	mem_cgroup_put(mem);
 }
 
diff --git a/mm/slab.c b/mm/slab.c
index 6d90a09..d83f312 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1742,6 +1742,12 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 		return NULL;
 
 	nr_pages = (1 << cachep->gfporder);
+
+	if (mem_cgroup_kmem_charge(cachep, flags, nr_pages)) {
+		free_pages((unsigned long)page_address(page), cachep->gfporder);
+		return NULL;
+	}
+
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 		add_zone_page_state(page_zone(page),
 			NR_SLAB_RECLAIMABLE, nr_pages);
@@ -1787,6 +1793,9 @@
 	}
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += nr_freed;
+
+	mem_cgroup_kmem_uncharge(cachep, nr_freed);
+
 	free_pages((unsigned long)addr, cachep->gfporder);
 }
 
@@ -2604,6 +2613,29 @@ out:
 }
 
 /* Called with cache_chain_mutex held to protect against cpu hotplug */
+static int __cache_shrink_pages(struct kmem_cache *cachep, int pages)
+{
+	int i = 0;
+	struct kmem_list3 *l3;
+	int freed = 0;
+
+	drain_cpu_caches(cachep);
+
+	check_irq_on();
+	for_each_online_node(i) {
+		l3 = cachep->nodelists[i];
+		if (!l3)
+			continue;
+
+		freed += drain_freelist(cachep, l3, l3->free_objects);
+		if (freed >= pages)
+			break;
+	}
+
+	return freed;
+}
+
+/* Called with cache_chain_mutex held to protect against cpu hotplug */
 static int __cache_shrink(struct kmem_cache *cachep)
 {
 	int ret = 0, i = 0;
@@ -2622,8 +2654,29 @@ static int __cache_shrink(struct kmem_cache *cachep)
 		ret += !list_empty(&l3->slabs_full) ||
 			!list_empty(&l3->slabs_partial);
 	}
+
 	return (ret ? 1 : 0);
 }
 
+/**
+ * kmem_cache_shrink_pages - Shrink a cache.
+ * @cachep: The cache to shrink.
+ * @nr_pages: how many pages to try to release.
+ *
+ * Releases free slabs; returns the number of slabs released.
+ */
+int kmem_cache_shrink_pages(struct kmem_cache *cachep, int nr_pages)
+{
+	int ret;
+	BUG_ON(!cachep || in_interrupt());
+
+	get_online_cpus();
+	mutex_lock(&cache_chain_mutex);
+	ret = __cache_shrink_pages(cachep, nr_pages);
+	mutex_unlock(&cache_chain_mutex);
+	put_online_cpus();
+	return ret;
+}
+EXPORT_SYMBOL(kmem_cache_shrink_pages);
 
 /**
  * kmem_cache_shrink - Shrink a cache.
@@ -2643,6 +2696,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
 	mutex_unlock(&cache_chain_mutex);
 	put_online_cpus();
 	return ret;
+
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
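
For reference, a slab user other than the dcache would opt into the new hooks the same way the fs/dcache.c changes above do: provide create/destroy callbacks, register a struct memcg_slab_ctlr once at init time, and pick the per-cgroup cache via current_kmem_cache() on each allocation. The sketch below illustrates that pattern for a hypothetical "foo" subsystem; it is not part of the patch, and struct foo and the foo_* names are made up for illustration.

/*
 * Illustrative sketch only (not part of this patch): a hypothetical "foo"
 * subsystem wired into the memcg slab controller API introduced above.
 */
#include <linux/memcontrol.h>
#include <linux/slab.h>

struct foo {
	int bar;
};

static struct kmem_cache *foo_cache;	/* global cache, used by the root cgroup */

static struct kmem_cache *foo_new_kmem(struct mem_cgroup *memcg,
				       struct mem_cgroup *parent)
{
	char *name;

	/* The root cgroup keeps using the global cache. */
	if (!parent)
		return foo_cache;

	name = kasprintf(GFP_KERNEL, "memcg-foo-%p", memcg);
	return kmem_cache_create(name, sizeof(struct foo),
				 __alignof__(struct foo), 0, NULL);
}

static void foo_destroy_kmem(struct kmem_cache *cachep)
{
	if (cachep == foo_cache)
		return;
	kfree(cachep->name);
	kmem_cache_destroy(cachep);
}

static struct memcg_slab_ctlr foo_ctlr = {
	.memcg_create_kmem_cache = foo_new_kmem,
	.memcg_destroy_kmem_cache = foo_destroy_kmem,
};

static struct foo *foo_alloc(void)
{
	/* Charge the current task's cgroup; fall back to the global cache. */
	struct kmem_cache *cachep = current_kmem_cache(foo_ctlr.token);

	if (!cachep)
		cachep = foo_cache;
	return kmem_cache_alloc(cachep, GFP_KERNEL);
}

static int __init foo_init(void)
{
	foo_cache = KMEM_CACHE(foo, 0);
	/* Assigns foo_ctlr.token; per-cgroup caches get created as cgroups are. */
	register_memcg_slab_ctlr(&foo_ctlr);
	return 0;
}

Note that caches only exist for cgroups created after the controller is registered, which is why the fallback to the global cache (here foo_cache, dentry_cache in __d_alloc above) is kept on the allocation path.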