[Devel] [PATCH RHEL7 COMMIT] mm/memcg: reclaim only kmem if kmem limit reached

Thu Aug 31 13:03:24 MSK 2017

The commit is pushed to "branch-rh7-3.10.0-514.26.1.vz7.35.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.26.1.vz7.35.5
------>
commit aa84e9472d88646f993f8bf1f2eb03a6abad93cd
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date:   Thu Aug 31 13:03:24 2017 +0300

    mm/memcg: reclaim only kmem if kmem limit reached
    
    If the kmem limit of a memcg is reached, we go into memory reclaim
    and reclaim everything we can, including page cache and anon memory.
    Reclaiming page cache or anon memory won't help, since only the kmem
    usage needs to be lowered. This patch fixes the problem by skipping
    non-kmem reclaim when the kmem limit is hit.
    
    https://jira.sw.ru/browse/PSBM-69226
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 include/linux/memcontrol.h | 10 ++++++++++
 include/linux/swap.h       |  2 +-
 mm/memcontrol.c            | 30 ++++++++++++++++--------------
 mm/vmscan.c                | 31 ++++++++++++++++++++++++-------
 4 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1a52e58..1d6bc80 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -45,6 +45,16 @@ struct mem_cgroup_reclaim_cookie {
 	unsigned int generation;
 };
 
+/*
+ * Reclaim flags for mem_cgroup_hierarchical_reclaim
+ */
+#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
+#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
+#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
+#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_KMEM_BIT	0x2
+#define MEM_CGROUP_RECLAIM_KMEM		(1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
+
 #ifdef CONFIG_MEMCG
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 			  gfp_t gfp_mask, struct mem_cgroup **memcgp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bd162f9..bd47451 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -324,7 +324,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 						  unsigned long nr_pages,
-						  gfp_t gfp_mask, bool noswap);
+						  gfp_t gfp_mask, int flags);
 extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						struct zone *zone,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09ce016..5372151 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -511,16 +511,6 @@ enum res_type {
 #define OOM_CONTROL		(0)
 
 /*
- * Reclaim flags for mem_cgroup_hierarchical_reclaim
- */
-#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
-#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
-#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
-#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-#define MEM_CGROUP_RECLAIM_KMEM_BIT	0x2
-#define MEM_CGROUP_RECLAIM_KMEM		(1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
-
-/*
  * The memcg_create_mutex will be held whenever a new cgroup is created.
  * As a consequence, any change that needs to protect against new child cgroups
  * appearing has to hold it as well.
@@ -2137,7 +2127,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 		if (loop)
 			drain_all_stock_async(memcg);
 		total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
-						      gfp_mask, noswap);
+						      gfp_mask, flags);
 		if (test_thread_flag(TIF_MEMDIE) ||
 		    fatal_signal_pending(current))
 			return 1;
@@ -2150,6 +2140,16 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 			break;
 		if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM))
 			break;
+
+		/*
+		 * Try harder to reclaim dcache. dcache reclaim may
+		 * temporarily fail due to dcache->dlock being held
+		 * by someone else. We must try harder to avoid premature
+		 * slab allocation failures.
+		 */
+		if (flags & MEM_CGROUP_RECLAIM_KMEM &&
+		    page_counter_read(&memcg->dcache))
+			continue;
 		/*
 		 * If nothing was reclaimed after two attempts, there
 		 * may be no reclaimable pages in this hierarchy.
@@ -2778,11 +2778,13 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
 	struct mem_cgroup *mem_over_limit;
 	struct page_counter *counter;
 	unsigned long nr_reclaimed;
-	unsigned long flags = 0;
+	unsigned long flags;
 
 	if (mem_cgroup_is_root(memcg))
 		goto done;
 retry:
+	flags = 0;
+
 	if (consume_stock(memcg, nr_pages)) {
 		if (!kmem_charge)
 			goto done;
@@ -4140,7 +4142,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 			return -EINTR;
 
 		progress = try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
-							GFP_KERNEL, false);
+							GFP_KERNEL, 0);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -4575,7 +4577,7 @@ static int mem_cgroup_high_write(struct cgroup *cont, struct cftype *cft,
 	usage = page_counter_read(&memcg->memory);
 	if (usage > nr_pages)
 		try_to_free_mem_cgroup_pages(memcg, usage - nr_pages,
-					     GFP_KERNEL, false);
+					     GFP_KERNEL, 0);
 	return 0;
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 277bd37..a5db594 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -88,6 +88,9 @@ struct scan_control {
 	/* Scan (total_size >> priority) pages at once */
 	int priority;
 
+	/* Reclaim only slab */
+	bool slab_only;
+
 	/*
 	 * The memory cgroup that hit its limit and as a result is the
 	 * primary target of this reclaim invocation.
@@ -2346,6 +2349,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_reclaimed, nr_scanned;
 	gfp_t slab_gfp = sc->gfp_mask;
+	bool slab_only = sc->slab_only;
 
 	/* Disable fs-related IO for direct reclaim */
 	if (!sc->target_mem_cgroup &&
@@ -2372,14 +2376,24 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
 			if (!sc->may_thrash && mem_cgroup_low(root, memcg))
 				continue;
 
-			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 			scanned = sc->nr_scanned;
-			shrink_lruvec(lruvec, sc, &lru_pages);
-			zone_lru_pages += lru_pages;
 
-			if (memcg && is_classzone)
+			if (!slab_only) {
+				lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+				shrink_lruvec(lruvec, sc, &lru_pages);
+				zone_lru_pages += lru_pages;
+			}
+
+			if (memcg && is_classzone) {
 				shrink_slab(slab_gfp, zone_to_nid(zone),
 					    memcg, sc->priority, false);
+				if (reclaim_state) {
+					sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+					sc->nr_scanned += reclaim_state->reclaimed_slab;
+					reclaim_state->reclaimed_slab = 0;
+				}
+
+			}
 
 			/*
 			 * Direct reclaim and kswapd have to scan all memory
@@ -2902,15 +2916,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   bool noswap)
+					   int flags)
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	struct reclaim_state reclaim_state = { 0 };
 	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = !noswap,
+		.may_swap = !(flags & MEM_CGROUP_RECLAIM_NOSWAP),
+		.slab_only = flags & MEM_CGROUP_RECLAIM_KMEM,
 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
 		.order = 0,
 		.priority = DEF_PRIORITY,
@@ -2933,10 +2949,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					    sc.may_writepage,
 					    sc.gfp_mask);
 
+	current->reclaim_state = &reclaim_state;
 	current->flags |= PF_MEMALLOC | PF_MEMCG_RECLAIM;
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 	current->flags &= ~(PF_MEMALLOC | PF_MEMCG_RECLAIM);
-
+	current->reclaim_state = NULL;
 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
 	return nr_reclaimed;