[Devel] [PATCH RHEL7 COMMIT] ms/mm: memcontrol: fold mem_cgroup_do_charge()

Mon Jan 16 08:27:07 PST 2017

The commit is pushed to "branch-rh7-3.10.0-514.vz7.27.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.vz7.27.10
------>
commit 64a6f65d36172c987ac23d6ecb1621d1953cf57b
Author: Johannes Weiner <hannes at cmpxchg.org>
Date:   Mon Jan 16 20:27:07 2017 +0400

    ms/mm: memcontrol: fold mem_cgroup_do_charge()
    
    These patches rework memcg charge lifetime to integrate more naturally
    with the lifetime of user pages.  This drastically simplifies the code
    and reduces charging and uncharging overhead.  The most expensive part
    of charging and uncharging is the page_cgroup bit spinlock, which is
    removed entirely after this series.
    
    Here are the top-10 profile entries of a stress test that reads a 128G
    sparse file on a freshly booted box, without even a dedicated cgroup
    (i.e. executing in the root memcg).  Before:
    
        15.36%              cat  [kernel.kallsyms]   [k] copy_user_generic_string
        13.31%              cat  [kernel.kallsyms]   [k] memset
        11.48%              cat  [kernel.kallsyms]   [k] do_mpage_readpage
         4.23%              cat  [kernel.kallsyms]   [k] get_page_from_freelist
         2.38%              cat  [kernel.kallsyms]   [k] put_page
         2.32%              cat  [kernel.kallsyms]   [k] __mem_cgroup_commit_charge
         2.18%          kswapd0  [kernel.kallsyms]   [k] __mem_cgroup_uncharge_common
         1.92%          kswapd0  [kernel.kallsyms]   [k] shrink_page_list
         1.86%              cat  [kernel.kallsyms]   [k] __radix_tree_lookup
         1.62%              cat  [kernel.kallsyms]   [k] __pagevec_lru_add_fn
    
    After:
    
        15.67%           cat  [kernel.kallsyms]   [k] copy_user_generic_string
        13.48%           cat  [kernel.kallsyms]   [k] memset
        11.42%           cat  [kernel.kallsyms]   [k] do_mpage_readpage
         3.98%           cat  [kernel.kallsyms]   [k] get_page_from_freelist
         2.46%           cat  [kernel.kallsyms]   [k] put_page
         2.13%       kswapd0  [kernel.kallsyms]   [k] shrink_page_list
         1.88%           cat  [kernel.kallsyms]   [k] __radix_tree_lookup
         1.67%           cat  [kernel.kallsyms]   [k] __pagevec_lru_add_fn
         1.39%       kswapd0  [kernel.kallsyms]   [k] free_pcppages_bulk
         1.30%           cat  [kernel.kallsyms]   [k] kfree
    
    As you can see, the memcg footprint has shrunk quite a bit.
    
       text    data     bss     dec     hex filename
      37970    9892     400   48262    bc86 mm/memcontrol.o.old
      35239    9892     400   45531    b1db mm/memcontrol.o
    
    This patch (of 13):
    
    This function was split out because mem_cgroup_try_charge() got too big.
    But having essentially one sequence of operations arbitrarily split in
    half is not good for reworking the code.  Fold it back in.
    
    Signed-off-by: Johannes Weiner <hannes at cmpxchg.org>
    Acked-by: Michal Hocko <mhocko at suse.cz>
    Cc: Hugh Dickins <hughd at google.com>
    Cc: Tejun Heo <tj at kernel.org>
    Cc: Vladimir Davydov <vdavydov at parallels.com>
    Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
    
    https://jira.sw.ru/browse/PSBM-51558
    (cherry picked from commit 6539cc053869bd32a2db731b215b7c73b11f68d3)
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 mm/memcontrol.c | 191 +++++++++++++++++---------------------------------------
 1 file changed, 57 insertions(+), 134 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1e5d914..f904257 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2744,86 +2744,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
-
-/* See mem_cgroup_try_charge() for details */
-enum {
-	CHARGE_OK,		/* success */
-	CHARGE_RETRY,		/* need to retry but retry is not bad */
-	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
-	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
-};
-
-static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
-				unsigned int nr_pages, unsigned int min_pages,
-				bool invoke_oom)
-{
-	struct mem_cgroup *mem_over_limit;
-	struct page_counter *counter;
-	unsigned long flags = 0;
-	int ret;
-
-	ret = page_counter_try_charge(&memcg->memory, nr_pages, &counter);
-
-	if (likely(!ret)) {
-		if (!do_swap_account)
-			return CHARGE_OK;
-		ret = page_counter_try_charge(&memcg->memsw, nr_pages, &counter);
-		if (likely(!ret))
-			return CHARGE_OK;
-
-		page_counter_uncharge(&memcg->memory, nr_pages);
-		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
-		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-	} else
-		mem_over_limit = mem_cgroup_from_counter(counter, memory);
-	/*
-	 * Never reclaim on behalf of optional batching, retry with a
-	 * single page instead.
-	 */
-	if (nr_pages > min_pages)
-		return CHARGE_RETRY;
-
-	if (!(gfp_mask & __GFP_WAIT)) {
-		mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages);
-		return CHARGE_WOULDBLOCK;
-	}
-
-	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
-	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
-		return CHARGE_RETRY;
-
-	if (gfp_mask & __GFP_NORETRY) {
-		mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages);
-		return CHARGE_WOULDBLOCK;
-	}
-	/*
-	 * Even though the limit is exceeded at this point, reclaim
-	 * may have been able to free some pages.  Retry the charge
-	 * before killing the task.
-	 *
-	 * Only for regular pages, though: huge pages are rather
-	 * unlikely to succeed so close to the limit, and we fall back
-	 * to regular pages anyway in case of failure.
-	 */
-	if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
-		return CHARGE_RETRY;
-
-	/*
-	 * At task move, charge accounts can be doubly counted. So, it's
-	 * better to wait until the end of task_move if something is going on.
-	 */
-	if (mem_cgroup_wait_acct_move(mem_over_limit))
-		return CHARGE_RETRY;
-
-	if (invoke_oom) {
-		mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages);
-		mem_cgroup_oom(mem_over_limit, gfp_mask,
-			       get_order(nr_pages * PAGE_SIZE));
-	}
-
-	return CHARGE_NOMEM;
-}
-
 /**
  * mem_cgroup_try_charge - try charging a memcg
  * @memcg: memcg to charge
@@ -2840,8 +2760,10 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
 {
 	unsigned int batch = max(CHARGE_BATCH, nr_pages);
 	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct mem_cgroup *iter;
-	int ret;
+	struct mem_cgroup *mem_over_limit;
+	struct page_counter *counter;
+	unsigned long nr_reclaimed;
+	unsigned long flags = 0;
 
 	if (mem_cgroup_is_root(memcg))
 		goto done;
@@ -2860,76 +2782,77 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
 
 	if (gfp_mask & __GFP_NOFAIL)
 		oom = false;
-again:
+retry:
 	if (consume_stock(memcg, nr_pages))
 		goto done;
 
-	do {
-		bool invoke_oom = oom && !nr_oom_retries;
+	if (!page_counter_try_charge(&memcg->memory, batch, &counter)) {
+		if (!do_swap_account)
+			goto done_restock;
+		if (!page_counter_try_charge(&memcg->memsw, batch, &counter))
+			goto done_restock;
+		page_counter_uncharge(&memcg->memory, batch);
+		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
+		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+	} else
+		mem_over_limit = mem_cgroup_from_counter(counter, memory);
 
-		/* If killed, bypass charge */
-		if (test_thread_flag(TIF_MEMDIE) ||
-		    fatal_signal_pending(current))
-			goto bypass;
+	if (batch > nr_pages) {
+		batch = nr_pages;
+		goto retry;
+	}
 
-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
-					   nr_pages, invoke_oom);
-		switch (ret) {
-		case CHARGE_OK:
-			break;
-		case CHARGE_RETRY: /* not in OOM situation but retry */
-			batch = nr_pages;
-			goto again;
-		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
-			goto nomem;
-		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom || invoke_oom)
-				goto nomem;
-			nr_oom_retries--;
-			break;
-		}
-	} while (ret != CHARGE_OK);
+	if (!(gfp_mask & __GFP_WAIT))
+		goto nomem;
 
+	if (gfp_mask & __GFP_NORETRY)
+		goto nomem;
+
+	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+
+	if (mem_cgroup_margin(mem_over_limit) >= batch)
+		goto retry;
 	/*
-	 * Cancel charge in case this cgroup was destroyed while we were here,
-	 * otherwise we can get a pending user memory charge to an offline
-	 * cgroup, which might result in use-after-free after the cgroup gets
-	 * released (see also mem_cgroup_css_offline()).
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages.  Retry the charge
+	 * before killing the task.
 	 *
-	 * Note, no need to issue an explicit barrier here, because a
-	 * successful charge implies full memory barrier.
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
 	 */
-	if (unlikely(memcg->is_offline)) {
-		page_counter_uncharge(&memcg->memory, batch);
-		if (do_swap_account)
-			page_counter_uncharge(&memcg->memsw, batch);
-		css_put(&memcg->css);
+	if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER))
+		goto retry;
+	/*
+	 * At task move, charge accounts can be doubly counted. So, it's
+	 * better to wait until the end of task_move if something is going on.
+	 */
+	if (mem_cgroup_wait_acct_move(mem_over_limit))
+		goto retry;
+
+	if (fatal_signal_pending(current))
 		goto bypass;
-	}
 
-	if (batch > nr_pages)
-		refill_stock(memcg, batch - nr_pages);
+	if (!oom)
+		goto nomem;
 
-	/*
-	 * If the hierarchy is above the normal consumption range,
-	 * make the charging task trim their excess contribution.
-	 */
-	iter = memcg;
-	do {
-		if (!(gfp_mask & __GFP_WAIT))
-			break;
-		if (page_counter_read(&iter->memory) <= iter->high)
-			continue;
-		try_to_free_mem_cgroup_pages(iter, nr_pages, gfp_mask, false);
-	} while ((iter = parent_mem_cgroup(iter)));
+	if (nr_oom_retries--)
+		goto retry;
+
+	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch));
+	mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages);
 
-done:
-	return 0;
 nomem:
 	if (!(gfp_mask & __GFP_NOFAIL))
 		return -ENOMEM;
 bypass:
 	return -EINTR;
+
+done_restock:
+	if (batch > nr_pages)
+		refill_stock(memcg, batch - nr_pages);
+done:
+	return 0;
 }
 
 /**