[Devel] [PATCH RHEL7 COMMIT] ms/mm: memcontrol: fix transparent huge page allocations under pressure

Konstantin Khorenko khorenko at virtuozzo.com
Wed Jan 31 18:48:02 MSK 2018


The commit is pushed to "branch-rh7-3.10.0-693.11.6.vz7.42.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.11.6.vz7.42.4
------>
commit d39d3862ddeb1ddfc77b53698c94eadc06c75f5c
Author: Johannes Weiner <hannes at cmpxchg.org>
Date:   Wed Jan 31 18:48:02 2018 +0300

    ms/mm: memcontrol: fix transparent huge page allocations under pressure
    
    In a memcg with even just moderate cache pressure, success rates for
    transparent huge page allocations drop to zero, wasting a lot of effort
    that the allocator puts into assembling these pages.
    
    The reason for this is that the memcg reclaim code was never designed for
    higher-order charges.  It reclaims in small batches until there is room
    for at least one page.  Huge page charges only succeed when these batches
    add up over a series of huge faults, which is unlikely under any
    significant load involving order-0 allocations in the group.
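    
    To put rough numbers on this (assuming x86-64 with 4 KiB base pages):
    a single huge page charge needs headroom for HPAGE_PMD_NR = 512 pages,
    while each pass of the old loop reclaims on the order of
    SWAP_CLUSTER_MAX = 32 pages and stops as soon as any margin appears.
    Roughly 512 / 32 = 16 such batches, spread over successive huge faults,
    have to accumulate without concurrent order-0 charges consuming the
    margin in between.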
    
    Remove that loop on the memcg side in favor of passing the actual reclaim
    goal to direct reclaim, which is already set up and optimized to meet
    higher-order goals efficiently.
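    
    Schematically (simplified from the hunks below; flag, margin and stock
    draining details omitted), the charge-time reclaim goes from fixed-size
    batches:
    
        for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++)
                total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
                                                      gfp_mask, flags);
    
    to a single call sized by the charge itself:
    
        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                                                    gfp_mask, flags);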
    
    This brings memcg's THP policy in line with the system policy: if the
    allocator painstakingly assembles a hugepage, memcg will at least make an
    honest effort to charge it.  As a result, transparent hugepage allocation
    rates amid cache activity are drastically improved:
    
                                          vanilla                 patched
    pgalloc                 4717530.80 (  +0.00%)   4451376.40 (  -5.64%)
    pgfault                  491370.60 (  +0.00%)    225477.40 ( -54.11%)
    pgmajfault                    2.00 (  +0.00%)         1.80 (  -6.67%)
    thp_fault_alloc               0.00 (  +0.00%)       531.60 (+100.00%)
    thp_fault_fallback          749.00 (  +0.00%)       217.40 ( -70.88%)
    
    [ Note: this may in turn increase memory consumption from internal
      fragmentation, which is an inherent risk of transparent hugepages.
      Some setups may have to adjust the memcg limits accordingly to
      accommodate this - or, if the machine is already packed to capacity,
      disable the transparent huge page feature. ]
    
    Signed-off-by: Johannes Weiner <hannes at cmpxchg.org>
    Reviewed-by: Vladimir Davydov <vdavydov at parallels.com>
    Cc: Michal Hocko <mhocko at suse.cz>
    Cc: Dave Hansen <dave at sr71.net>
    Cc: Greg Thelen <gthelen at google.com>
    Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
    
    https://jira.sw.ru/browse/PSBM-80732
    (cherry picked from commit b70a2a21dc9d4ad455931b53131a0cb4fc01fafe)
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 mm/memcontrol.c | 70 +++++++++++++--------------------------------------------
 1 file changed, 15 insertions(+), 55 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5396157488ec..8a46ce2483a3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2150,54 +2150,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			 NULL, "Memory cgroup out of memory");
 }
 
-static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
-					gfp_t gfp_mask,
-					unsigned long flags)
-{
-	unsigned long total = 0;
-	bool noswap = false;
-	int loop;
-
-	if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
-		noswap = true;
-
-	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
-		if (loop)
-			drain_all_stock_async(memcg);
-		total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
-						      gfp_mask, flags);
-		if (test_thread_flag(TIF_MEMDIE) ||
-		    fatal_signal_pending(current))
-			return 1;
-		/*
-		 * Allow limit shrinkers, which are triggered directly
-		 * by userspace, to catch signals and stop reclaim
-		 * after minimal progress, regardless of the margin.
-		 */
-		if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
-			break;
-		if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM))
-			break;
-
-		/*
-		 * Try harder to reclaim dcache. dcache reclaim may
-		 * temporarly fail due to dcache->dlock being held
-		 * by someone else. We must try harder to avoid premature
-		 * slab allocation failures.
-		 */
-		if (flags & MEM_CGROUP_RECLAIM_KMEM &&
-		    page_counter_read(&memcg->dcache))
-			continue;
-		/*
-		 * If nothing was reclaimed after two attempts, there
-		 * may be no reclaimable pages in this hierarchy.
-		 */
-		if (loop && !total)
-			break;
-	}
-	return total;
-}
-
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -2817,6 +2769,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
 	struct page_counter *counter;
 	unsigned long nr_reclaimed;
 	unsigned long flags;
+	bool drained = false;
 
 	if (mem_cgroup_is_root(memcg))
 		goto done;
@@ -2908,12 +2861,19 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
 	if (!(gfp_mask & __GFP_WAIT))
 		goto nomem;
 
-	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+						    gfp_mask, flags);
 
 	if (mem_cgroup_margin(mem_over_limit,
 				flags & MEM_CGROUP_RECLAIM_KMEM) >= batch)
 		goto retry;
 
+	if (!drained) {
+		drain_all_stock_async(mem_over_limit);
+		drained = true;
+		goto retry;
+	}
+
 	if (gfp_mask & __GFP_NORETRY)
 		goto nomem;
 	/*
@@ -3877,8 +3837,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		mem_cgroup_reclaim(memcg, GFP_KERNEL,
-				   MEM_CGROUP_RECLAIM_SHRINK);
+		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 0);
+
 		curusage = page_counter_read(&memcg->memory);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -3929,9 +3889,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		mem_cgroup_reclaim(memcg, GFP_KERNEL,
-				   MEM_CGROUP_RECLAIM_NOSWAP |
-				   MEM_CGROUP_RECLAIM_SHRINK);
+		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+					MEM_CGROUP_RECLAIM_NOSWAP);
+
 		curusage = page_counter_read(&memcg->memsw);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -4189,7 +4149,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 		if (signal_pending(current))
 			return -EINTR;
 
-		progress = try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
+		progress = try_to_free_mem_cgroup_pages(memcg, 1,
 							GFP_KERNEL, 0);
 		if (!progress) {
 			nr_retries--;

