[Devel] [PATCH RHEL7 COMMIT] ms/mm: use sc->priority for slab shrink targets

Konstantin Khorenko khorenko at virtuozzo.com
Thu Aug 31 13:03:23 MSK 2017


The commit is pushed to "branch-rh7-3.10.0-514.26.1.vz7.35.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.26.1.vz7.35.5
------>
commit 5a99b388025b5981f44c29588a22dc37607f990c
Author: Josef Bacik <jbacik at fb.com>
Date:   Thu Aug 31 13:03:23 2017 +0300

    ms/mm: use sc->priority for slab shrink targets
    
    Previously we were using the ratio of the number of LRU pages scanned to
    the number of eligible LRU pages to determine the number of slab objects
    to scan.  The problem with this is that these two things have nothing to
    do with each other, so in slab-heavy workloads where there is little to
    no page cache we can end up with the number of pages scanned being very
    low.  This means that we reclaim next to no slab pages and waste a lot
    of time reclaiming small amounts of space.
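    
    As a hypothetical illustration (the numbers are invented for this
    example): with nr_scanned = 32 pages, nr_eligible = 1,000,000 LRU pages
    and a shrinker holding max_pass = 1,000,000 objects at the default seeks
    of 2, the old calculation gives
    
    delta = (nr_scanned / seeks) * max_pass / (nr_eligible + 1)
          = (32 / 2) * 1,000,000 / 1,000,001
          ~= 15
    
    so the shrinker is asked to consider only ~15 of its million objects per
    pass, no matter how large the slab cache has grown.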
    
    Instead, use sc->priority in the same way we use it to determine scan
    amounts for the LRUs.  This generally equates to pages.  Consider the
    following:
    
    slab_pages = (nr_objects * object_size) / PAGE_SIZE
    
    What we would like to do is
    
    scan = slab_pages >> sc->priority
    
    but we don't know the number of slab pages each shrinker controls, only
    the number of objects.  However, even if we theoretically knew how many
    pages a shrinker controlled, we'd still have to convert that to objects,
    which would look like the following:
    
    scan = shrinker_pages >> sc->priority
    scan_objects = (PAGE_SIZE / object_size) * scan
    
    or written another way
    
    scan_objects = (shrinker_pages >> sc->priority) *
    		(PAGE_SIZE / object_size)
    
    which can thus be written
    
    scan_objects = ((shrinker_pages * PAGE_SIZE) / object_size) >>
    		sc->priority
    
    which is just
    
    scan_objects = nr_objects >> sc->priority
    
    We don't need to know exactly how many pages each shrinker represents;
    its object count is all the information we need.  Making this change
    allows us to place an appropriate amount of pressure on the shrinker
    pools relative to their size.
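    
    Concretely, the delta computation in do_shrink_slab() changes as follows
    (a condensed view of the hunk below; the factor of 4 pairs with the
    default seeks of 2, so a shrinker with the default cost is asked to scan
    roughly max_pass >> (priority - 1)):
    
    	/* old: scales with how much of the LRU we happened to scan */
    	delta = nr_scanned / shrinker->seeks;
    	delta *= max_pass;
    	do_div(delta, nr_eligible + 1);
    
    	/* new: scales with the shrinker's own object count */
    	delta = max_pass >> priority;
    	delta = (4 * delta) / shrinker->seeks;
    
    At DEF_PRIORITY (12) a cache of 2^20 objects gives max_pass >> priority
    = 256, doubled by the 4/seeks factor for a default shrinker; the target
    grows as priority drops, and at priority 0, which drop_slab_node() now
    passes, the whole cache becomes eligible for scanning.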
    
    Signed-off-by: Josef Bacik <jbacik at fb.com>
    
    https://jira.sw.ru/browse/PSBM-69226
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 include/trace/events/vmscan.h | 23 ++++++++++------------
 mm/vmscan.c                   | 44 ++++++++++++-------------------------------
 2 files changed, 22 insertions(+), 45 deletions(-)

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 132a985..d98fb0a 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -181,23 +181,22 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re
 
 TRACE_EVENT(mm_shrink_slab_start,
 	TP_PROTO(struct shrinker *shr, struct shrink_control *sc,
-		long nr_objects_to_shrink, unsigned long pgs_scanned,
-		unsigned long lru_pgs, unsigned long cache_items,
-		unsigned long long delta, unsigned long total_scan),
+		long nr_objects_to_shrink, unsigned long cache_items,
+		unsigned long long delta, unsigned long total_scan,
+		int priority),
 
-	TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs,
-		cache_items, delta, total_scan),
+	TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan,
+		priority),
 
 	TP_STRUCT__entry(
 		__field(struct shrinker *, shr)
 		__field(void *, shrink)
 		__field(long, nr_objects_to_shrink)
 		__field(gfp_t, gfp_flags)
-		__field(unsigned long, pgs_scanned)
-		__field(unsigned long, lru_pgs)
 		__field(unsigned long, cache_items)
 		__field(unsigned long long, delta)
 		__field(unsigned long, total_scan)
+		__field(int, priority)
 	),
 
 	TP_fast_assign(
@@ -205,23 +204,21 @@ TRACE_EVENT(mm_shrink_slab_start,
 		__entry->shrink = shr->scan_objects;
 		__entry->nr_objects_to_shrink = nr_objects_to_shrink;
 		__entry->gfp_flags = sc->gfp_mask;
-		__entry->pgs_scanned = pgs_scanned;
-		__entry->lru_pgs = lru_pgs;
 		__entry->cache_items = cache_items;
 		__entry->delta = delta;
 		__entry->total_scan = total_scan;
+		__entry->priority = priority;
 	),
 
-	TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld",
+	TP_printk("%pF %p: objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d",
 		__entry->shrink,
 		__entry->shr,
 		__entry->nr_objects_to_shrink,
 		show_gfp_flags(__entry->gfp_flags),
-		__entry->pgs_scanned,
-		__entry->lru_pgs,
 		__entry->cache_items,
 		__entry->delta,
-		__entry->total_scan)
+		__entry->total_scan,
+		__entry->priority)
 );
 
 TRACE_EVENT(mm_shrink_slab_end,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b9e77c3..277bd37 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -223,9 +223,7 @@ EXPORT_SYMBOL(unregister_shrinker);
 #define SHRINK_BATCH 128
 
 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
-				    struct shrinker *shrinker,
-				    unsigned long nr_scanned,
-				    unsigned long nr_eligible)
+				    struct shrinker *shrinker, int priority)
 {
 	unsigned long freed = 0;
 	unsigned long long delta;
@@ -249,9 +247,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
 
 	total_scan = nr;
-	delta = nr_scanned / shrinker->seeks;
-	delta *= max_pass;
-	do_div(delta, nr_eligible + 1);
+	delta = max_pass >> priority;
+	delta = (4 * delta) / shrinker->seeks;
 	total_scan += delta;
 	if (total_scan < 0) {
 		printk(KERN_ERR
@@ -284,8 +281,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 		total_scan = max_pass * 2;
 
 	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-				   nr_scanned, nr_eligible,
-				   max_pass, delta, total_scan);
+				   max_pass, delta, total_scan, priority);
 
 	while (total_scan >= batch_size) {
 		unsigned long ret;
@@ -325,8 +321,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  * @gfp_mask: allocation context
  * @nid: node whose slab caches to target
  * @memcg: memory cgroup whose slab caches to target
- * @nr_scanned: pressure numerator
- * @nr_eligible: pressure denominator
+ * @priority: the reclaim priority
  *
  * Call the shrink functions to age shrinkable caches.
  *
@@ -339,20 +334,14 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  * are called, and memcg aware shrinkers are supposed to scan the
  * global list then.
  *
- * @nr_scanned and @nr_eligible form a ratio that indicate how much of
- * the available objects should be scanned.  Page reclaim for example
- * passes the number of pages scanned and the number of pages on the
- * LRU lists that it considered on @nid, plus a bias in @nr_scanned
- * when it encountered mapped pages.  The ratio is further biased by
- * the ->seeks setting of the shrink function, which indicates the
- * cost to recreate an object relative to that of an LRU page.
+ * @priority is sc->priority, we take the number of objects and >> by priority
+ * in order to get the scan target.
  *
  * Returns the number of reclaimed slab objects.
  */
 static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 				 struct mem_cgroup *memcg,
-				 unsigned long nr_scanned,
-				 unsigned long nr_eligible,
+				 int priority,
 				 bool for_drop_caches)
 {
 	struct shrinker *shrinker;
@@ -361,9 +350,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 	if (memcg && !memcg_kmem_is_active(memcg))
 		return 0;
 
-	if (nr_scanned == 0)
-		nr_scanned = SWAP_CLUSTER_MAX;
-
 	if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
 		return 0;
 
@@ -392,7 +378,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 			sc.nid = 0;
 
-		freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+		freed += do_shrink_slab(&sc, shrinker, priority);
 	}
 
 	up_read(&shrinker_rwsem);
@@ -411,7 +397,7 @@ void drop_slab_node(int nid)
 		freed = 0;
 		do {
 			freed += shrink_slab(GFP_KERNEL, nid, memcg,
-					     1000, 1000, true);
+					     0, true);
 		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
 	} while (freed > 10);
 }
@@ -2388,14 +2374,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
 
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 			scanned = sc->nr_scanned;
-
 			shrink_lruvec(lruvec, sc, &lru_pages);
 			zone_lru_pages += lru_pages;
 
 			if (memcg && is_classzone)
 				shrink_slab(slab_gfp, zone_to_nid(zone),
-					    memcg, sc->nr_scanned - scanned,
-					    lru_pages, false);
+					    memcg, sc->priority, false);
 
 			/*
 			 * Direct reclaim and kswapd have to scan all memory
@@ -2414,10 +2398,6 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
 			}
 		} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
 
-		/*
-		 * Shrink the slab caches in the same proportion that
-		 * the eligible LRU pages were scanned.
-		 */
 		if (global_reclaim(sc) && is_classzone) {
 			unsigned long scanned, eligible;
 
@@ -2439,7 +2419,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
 			}
 
 			shrink_slab(slab_gfp, zone_to_nid(zone), NULL,
-				    scanned, eligible, false);
+				    sc->priority, false);
 		}
 
 		if (reclaim_state) {
