[Devel] [PATCH rh7 2/2] mm/memcg: improve mem_cgroup_dcache_is_low() performance.

Andrey Ryabinin aryabinin at virtuozzo.com
Tue Jul 25 14:28:36 MSK 2017


mem_cgroup_dcache_is_low() is called during memory reclaim for every
mem cgroup, but it's awfully slow: it iterates over every possible CPU
to collect the anon, file and reclaimable slab counters.

Switch the anon, file and reclaimable slab counters to percpu_counter.
This allows reading them with a single load instead of iterating over
all CPUs.
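
For context, a simplified sketch of the two read paths (the helper
names below are illustrative only; the actual readers are
mem_cgroup_read_stat() and the new mem_cgroup_read_stat2() in the diff):

    /*
     * Before (simplified): every read sums the per-cpu deltas, so the
     * cost grows with the number of possible CPUs.
     */
    static unsigned long read_stat_percpu(struct mem_cgroup *memcg,
                                          enum mem_cgroup_stat_index idx)
    {
            long val = 0;
            int cpu;

            for_each_possible_cpu(cpu)
                    val += per_cpu(memcg->stat->count[idx], cpu);
            return val < 0 ? 0 : val;
    }

    /*
     * After: percpu_counter folds per-cpu deltas into a shared ->count
     * once they exceed the batch size, so a read is a single (slightly
     * stale) load.
     */
    static unsigned long read_stat_pcc(struct mem_cgroup *memcg,
                                       enum mem_cgroup_stat2_index idx)
    {
            return percpu_counter_read_positive(&memcg->stat2.counters[idx]);
    }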

https://jira.sw.ru/browse/PSBM-68644
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 mm/memcontrol.c | 108 +++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 84 insertions(+), 24 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9678957bf22..11f9bc07e41 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -95,28 +95,38 @@ enum mem_cgroup_stat_index {
 	/*
 	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
 	 */
-	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
-	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
 	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
 	MEM_CGROUP_STAT_SHMEM,		/* # of charged shmem pages */
-	MEM_CGROUP_STAT_SLAB_RECLAIMABLE, /* # of reclaimable slab pages */
 	MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE, /* # of unreclaimable slab pages */
 	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
 };
 
+enum mem_cgroup_stat2_index {
+	/*
+	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+	 */
+	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
+	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
+	MEM_CGROUP_STAT_SLAB_RECLAIMABLE, /* # of reclaimable slab pages */
+	MEM_CGROUP_STAT2_NSTATS,
+};
+
 static const char * const mem_cgroup_stat_names[] = {
-	"cache",
-	"rss",
 	"rss_huge",
 	"mapped_file",
 	"shmem",
-	"slab_reclaimable",
 	"slab_unreclaimable",
 	"swap",
 };
 
+static const char * const mem_cgroup_stat2_names[] = {
+	"cache",
+	"rss",
+	"slab_reclaimable",
+};
+
 enum mem_cgroup_events_index {
 	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
 	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
@@ -167,6 +177,10 @@ struct mem_cgroup_stat_cpu {
 	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
+struct mem_cgroup_stat2_cpu {
+	struct percpu_counter counters[MEM_CGROUP_STAT2_NSTATS];
+};
+
 struct mem_cgroup_reclaim_iter {
 	/*
 	 * last scanned hierarchy member. Valid only if last_dead_count
@@ -368,6 +382,7 @@ struct mem_cgroup {
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu __percpu *stat;
+	struct mem_cgroup_stat2_cpu stat2;
 	spinlock_t pcp_counter_lock;
 
 	atomic_t	dead_count;
@@ -956,6 +971,11 @@ mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
 		val = 0;
 	return val;
 }
+static inline unsigned long
+mem_cgroup_read_stat2(struct mem_cgroup *memcg, enum mem_cgroup_stat2_index idx)
+{
+	return percpu_counter_read_positive(&memcg->stat2.counters[idx]);
+}
 
 static void mem_cgroup_update_swap_max(struct mem_cgroup *memcg)
 {
@@ -1013,10 +1033,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 	 * counted as CACHE even if it's on ANON LRU.
 	 */
 	if (PageAnon(page))
-		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+		percpu_counter_add(&memcg->stat2.counters[MEM_CGROUP_STAT_RSS],
 				nr_pages);
 	else {
-		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+		percpu_counter_add(&memcg->stat2.counters[MEM_CGROUP_STAT_CACHE],
 				nr_pages);
 		if (PageSwapBacked(page))
 			__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM],
@@ -1593,9 +1613,9 @@ bool mem_cgroup_dcache_is_low(struct mem_cgroup *memcg)
 	if (vfs_cache_min_ratio <= 0)
 		return false;
 
-	anon = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
-	file = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
-	dcache = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
+	anon = mem_cgroup_read_stat2(memcg, MEM_CGROUP_STAT_RSS);
+	file = mem_cgroup_read_stat2(memcg, MEM_CGROUP_STAT_CACHE);
+	dcache = mem_cgroup_read_stat2(memcg, MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
 
 	return dcache / vfs_cache_min_ratio <
 			(anon + file + dcache) / 100;
@@ -1979,6 +1999,10 @@ done:
 			pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
 				K(mem_cgroup_read_stat(iter, i)));
 		}
+		for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+			pr_cont(" %s:%luKB", mem_cgroup_stat2_names[i],
+				K(mem_cgroup_read_stat2(iter, i)));
+		}
 
 		for (i = 0; i < NR_LRU_LISTS; i++)
 			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
@@ -3120,10 +3144,11 @@ int __memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, unsigned int nr_pages)
 	if (s->flags & SLAB_RECLAIM_ACCOUNT) {
 		page_counter_charge(&memcg->dcache, nr_pages);
 		idx = MEM_CGROUP_STAT_SLAB_RECLAIMABLE;
-	} else
+		percpu_counter_add(&memcg->stat2.counters[idx], nr_pages);
+	} else {
 		idx = MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE;
-
-	this_cpu_add(memcg->stat->count[idx], nr_pages);
+		this_cpu_add(memcg->stat->count[idx], nr_pages);
+	}
 	return 0;
 }
 
@@ -3139,10 +3164,11 @@ void __memcg_uncharge_slab(struct kmem_cache *s, unsigned int nr_pages)
 	if (s->flags & SLAB_RECLAIM_ACCOUNT) {
 		page_counter_uncharge(&memcg->dcache, nr_pages);
 		idx = MEM_CGROUP_STAT_SLAB_RECLAIMABLE;
-	} else
+		percpu_counter_sub(&memcg->stat2.counters[idx], nr_pages);
+	} else {
 		idx = MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE;
-
-	this_cpu_sub(memcg->stat->count[idx], nr_pages);
+		this_cpu_sub(memcg->stat->count[idx], nr_pages);
+	}
 }
 
 /*
@@ -4195,6 +4221,17 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
 
 	return val;
 }
+static unsigned long mem_cgroup_recursive_stat2(struct mem_cgroup *memcg,
+					       enum mem_cgroup_stat2_index idx)
+{
+	struct mem_cgroup *iter;
+	unsigned long val = 0;
+
+	for_each_mem_cgroup_tree(iter, memcg)
+		val += mem_cgroup_read_stat2(iter, idx);
+
+	return val;
+}
 
 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
@@ -4211,8 +4248,8 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
 	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
 	 */
-	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
-	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+	val = mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_CACHE);
+	val += mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_RSS);
 
 	if (swap)
 		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
@@ -4228,11 +4265,11 @@ void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
 	for_each_online_node(nid)
 		mem_cgroup_get_nr_pages(memcg, nid, mi->pages);
 
-	mi->slab_reclaimable = mem_cgroup_recursive_stat(memcg,
+	mi->slab_reclaimable = mem_cgroup_recursive_stat2(memcg,
 					MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
 	mi->slab_unreclaimable = mem_cgroup_recursive_stat(memcg,
 					MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE);
-	mi->cached = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	mi->cached = mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_CACHE);
 	mi->shmem = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
 }
 
@@ -4247,7 +4284,7 @@ int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
 	free += page_counter_read(&memcg->dcache);
 
 	/* assume file cache is reclaimable */
-	free += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	free += mem_cgroup_recursive_stat2(memcg, MEM_CGROUP_STAT_CACHE);
 
 	/* but do not count shmem pages as they can't be purged,
 	 * only swapped out */
@@ -5094,6 +5131,10 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
 		seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
 			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
 	}
+	for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+		seq_printf(m, "%s %lu\n", mem_cgroup_stat2_names[i],
+			   mem_cgroup_read_stat2(memcg, i) * PAGE_SIZE);
+	}
 
 	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
 		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
@@ -5124,6 +5165,13 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
 			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
 		seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
 	}
+	for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+		unsigned long long val = 0;
+
+		for_each_mem_cgroup_tree(mi, memcg)
+			val += mem_cgroup_read_stat2(mi, i) * PAGE_SIZE;
+		seq_printf(m, "total_%s %llu\n", mem_cgroup_stat2_names[i], val);
+	}
 
 	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
 		unsigned long long val = 0;
@@ -5858,6 +5906,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 {
 	struct mem_cgroup *memcg;
 	size_t size;
+	int i, ret;
 
 	size = sizeof(struct mem_cgroup);
 	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -5869,9 +5918,20 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
 	if (!memcg->stat)
 		goto out_free;
+
+	for (i = 0; i < MEM_CGROUP_STAT2_NSTATS; i++) {
+		ret = percpu_counter_init(&memcg->stat2.counters[i], 0, GFP_KERNEL);
+		if (ret)
+			goto out_pcpu_free;
+	}
 	spin_lock_init(&memcg->pcp_counter_lock);
 	return memcg;
 
+out_pcpu_free:
+	while (--i >= 0)
+		percpu_counter_destroy(&memcg->stat2.counters[i]);
+
+	free_percpu(memcg->stat);
 out_free:
 	kfree(memcg);
 	return NULL;
@@ -7017,8 +7077,8 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 	}
 
 	local_irq_save(flags);
-	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
-	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
+	percpu_counter_sub(&memcg->stat2.counters[MEM_CGROUP_STAT_RSS], nr_anon);
+	percpu_counter_sub(&memcg->stat2.counters[MEM_CGROUP_STAT_CACHE], nr_file);
 	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
 	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem);
 	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
-- 
2.13.0


