[Devel] [PATCH RHEL7 COMMIT] ms/mm: memcontrol: revert use of root_mem_cgroup res_counter
Konstantin Khorenko
khorenko at virtuozzo.com
Mon Jan 16 08:27:16 PST 2017
The commit is pushed to "branch-rh7-3.10.0-514.vz7.27.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.vz7.27.10
------>
commit b47182055b23197e9037214bdd631bfb73bf251c
Author: Johannes Weiner <hannes at cmpxchg.org>
Date: Mon Jan 16 20:27:16 2017 +0400
ms/mm: memcontrol: revert use of root_mem_cgroup res_counter
Dave Hansen reports a massive scalability regression in an uncontained
page fault benchmark with more than 30 concurrent threads, which he
bisected down to 05b843012335 ("mm: memcontrol: use root_mem_cgroup
res_counter") and pin-pointed on res_counter spinlock contention.

That change relied on the per-cpu charge caches to mostly swallow the
res_counter costs, but it's apparent that the caches don't scale yet.

Revert memcg back to bypassing res_counters on the root level in order
to restore performance for uncontained workloads.
Reported-by: Dave Hansen <dave at sr71.net>
Signed-off-by: Johannes Weiner <hannes at cmpxchg.org>
Tested-by: Dave Hansen <dave.hansen at intel.com>
Acked-by: Michal Hocko <mhocko at suse.cz>
Acked-by: Vladimir Davydov <vdavydov at parallels.com>
Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit ce00a967377baadf2481521e131771adc7652856)
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
mm/memcontrol.c | 73 ++++++++++++++++++++++++++++++++-------------------------
1 file changed, 41 insertions(+), 32 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 16bb6aa..6c11788 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4158,8 +4158,8 @@ out:
}
-static unsigned long tree_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx)
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx)
{
struct mem_cgroup *iter;
long val = 0;
@@ -4173,6 +4173,30 @@ static unsigned long tree_stat(struct mem_cgroup *memcg,
return val;
}
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+{
+ u64 val;
+
+ if (!mem_cgroup_is_root(memcg)) {
+ if (!swap)
+ return page_counter_read(&memcg->memory);
+ else
+ return page_counter_read(&memcg->memsw);
+ }
+
+ /*
+ * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+ * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+ */
+ val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+
+ if (swap)
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+
+ return val << PAGE_SHIFT;
+}
+
void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
{
int nid;
@@ -4181,12 +4205,12 @@ void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
for_each_online_node(nid)
mem_cgroup_get_nr_pages(memcg, nid, mi->pages);
- mi->slab_reclaimable = tree_stat(memcg,
+ mi->slab_reclaimable = mem_cgroup_recursive_stat(memcg,
MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
- mi->slab_unreclaimable = tree_stat(memcg,
+ mi->slab_unreclaimable = mem_cgroup_recursive_stat(memcg,
MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE);
- mi->cached = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
- mi->shmem = tree_stat(memcg, MEM_CGROUP_STAT_SHMEM);
+ mi->cached = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+ mi->shmem = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
}
int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
@@ -4200,33 +4224,15 @@ int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
free += page_counter_read(&memcg->dcache);
/* assume file cache is reclaimable */
- free += tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
+ free += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
/* but do not count shmem pages as they can't be purged,
* only swapped out */
- free -= tree_stat(memcg, MEM_CGROUP_STAT_SHMEM);
+ free -= mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
return free < pages ? -ENOMEM : 0;
}
-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
-{
- u64 val;
-
- if (mem_cgroup_is_root(memcg)) {
- val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
- val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
- if (swap)
- val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
- } else {
- if (!swap)
- val = page_counter_read(&memcg->memory);
- else
- val = page_counter_read(&memcg->memsw);
- }
- return val << PAGE_SHIFT;
-}
-
enum {
RES_USAGE,
RES_LIMIT,
@@ -6760,7 +6766,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
rcu_read_lock();
memcg = mem_cgroup_lookup(id);
if (memcg) {
- page_counter_uncharge(&memcg->memsw, 1);
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memsw, 1);
mem_cgroup_swap_statistics(memcg, false);
css_put(&memcg->css);
}
@@ -6919,12 +6926,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
{
unsigned long flags;
- if (nr_mem)
- page_counter_uncharge(&memcg->memory, nr_mem);
- if (nr_memsw)
- page_counter_uncharge(&memcg->memsw, nr_memsw);
+ if (!mem_cgroup_is_root(memcg)) {
+ if (nr_mem)
+ page_counter_uncharge(&memcg->memory, nr_mem);
+ if (nr_memsw)
+ page_counter_uncharge(&memcg->memsw, nr_memsw);
- memcg_oom_recover(memcg);
+ memcg_oom_recover(memcg);
+ }
local_irq_save(flags);
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
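For illustration only, below is a minimal, self-contained userspace sketch of
the pattern this patch restores; the struct, field, and function names are
hypothetical and stand in for the kernel's page_counter/res_counter API rather
than reproducing it. Non-root groups charge and uncharge a shared counter, the
root group bypasses it, and root usage is reconstructed by summing statistics
instead of reading a counter.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical, simplified model of the root-bypass pattern. */
struct counter {
	long usage;            /* stands in for a contended page_counter */
};

struct group {
	bool is_root;          /* the root group bypasses the shared counter */
	struct counter memory; /* charged only for non-root groups */
	long stat_cache;       /* per-group statistics, cheap to update */
	long stat_rss;
};

static void charge(struct group *g, long nr_pages)
{
	if (!g->is_root)                 /* root skips the counter... */
		g->memory.usage += nr_pages;
	g->stat_rss += nr_pages;         /* ...but statistics still track it */
}

static void uncharge(struct group *g, long nr_pages)
{
	if (!g->is_root)
		g->memory.usage -= nr_pages;
	g->stat_rss -= nr_pages;
}

/* Root usage is reconstructed from statistics; others read the counter. */
static long usage(const struct group *g)
{
	if (g->is_root)
		return g->stat_cache + g->stat_rss;
	return g->memory.usage;
}

int main(void)
{
	struct group root = { .is_root = true };

	charge(&root, 128);
	uncharge(&root, 32);
	printf("root usage: %ld pages\n", usage(&root)); /* prints 96 */
	return 0;
}

The point of the bypass is that the shared counter (res_counter/page_counter
in the kernel) is a single lock-protected location that every concurrent page
fault would hit, while the statistics the root falls back to are cheap
per-group (per-cpu in the kernel) updates.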