[Devel] [PATCH RHEL7 COMMIT] mm/vmscan: Use per-zone sum of reclaim_stat to change zone state.
Konstantin Khorenko
khorenko at virtuozzo.com
Wed Jan 31 18:29:34 MSK 2018
The commit is pushed to "branch-rh7-3.10.0-693.11.6.vz7.42.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.11.6.vz7.42.4
------>
commit 6254b06fdfb6656bd5f503a4bfc9d42db67e248d
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date: Wed Jan 31 18:29:34 2018 +0300
mm/vmscan: Use per-zone sum of reclaim_stat to change zone state.
Currently we collect reclaim stats per-lru list and set zone
flags based on these stats. This seems wrong, as lrus are per-memcg,
thus one zone could have hundreds of them.
Move all that zone-related logic from shrink_inactive_list() to
shrink_zone(), and make decisions based on the per-zone sum of reclaim
stats instead of just per-lru values.
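To make the intent concrete, here is a minimal, self-contained C sketch
of the idea (the struct and helper names below are simplified stand-ins,
not the actual mm/vmscan.c interfaces): each per-memcg lruvec scan adds
its counters into a per-zone accumulator, and the zone flags are decided
from the summed values only.

#include <stdio.h>

/* Simplified stand-in for the reclaim counters gathered per LRU list. */
struct reclaim_stat {
        unsigned long nr_taken;
        unsigned long nr_dirty;
        unsigned long nr_congested;
        unsigned long nr_unqueued_dirty;
        unsigned long nr_writeback;
        unsigned long nr_immediate;
};

/* Accumulate one lruvec's counters into the per-zone sum. */
static void add_reclaim_stat(struct reclaim_stat *sum,
                             const struct reclaim_stat *lru)
{
        sum->nr_taken          += lru->nr_taken;
        sum->nr_dirty          += lru->nr_dirty;
        sum->nr_congested      += lru->nr_congested;
        sum->nr_unqueued_dirty += lru->nr_unqueued_dirty;
        sum->nr_writeback      += lru->nr_writeback;
        sum->nr_immediate      += lru->nr_immediate;
}

/* Decide zone state from the per-zone sum, mirroring the checks below. */
static void check_zone_state(const struct reclaim_stat *sum)
{
        if (sum->nr_writeback && sum->nr_writeback == sum->nr_taken)
                printf("would set ZONE_WRITEBACK\n");
        if (sum->nr_dirty && sum->nr_dirty == sum->nr_congested)
                printf("would set ZONE_CONGESTED\n");
        if (sum->nr_taken && sum->nr_unqueued_dirty == sum->nr_taken)
                printf("would set ZONE_TAIL_LRU_DIRTY\n");
}

int main(void)
{
        /* Two hypothetical per-memcg lruvecs belonging to the same zone. */
        struct reclaim_stat a = { .nr_taken = 32, .nr_writeback = 32 };
        struct reclaim_stat b = { .nr_taken = 32, .nr_writeback = 0  };
        struct reclaim_stat sum = { 0 };

        add_reclaim_stat(&sum, &a);
        add_reclaim_stat(&sum, &b);

        /* Per-lru, lruvec "a" alone (all 32 taken pages under writeback)
         * would have flagged the zone; the per-zone sum (32 of 64) does
         * not, which matches the per-zone decision this patch switches to. */
        check_zone_state(&sum);
        return 0;
}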
https://jira.sw.ru/browse/PSBM-61409
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
mm/vmscan.c | 109 ++++++++++++++++++++++++++++++------------------------------
1 file changed, 54 insertions(+), 55 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e6dde1e15a54..d71fa15a1750 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1628,61 +1628,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
sc->stat->nr_immediate += stat.nr_immediate;
}
- /*
- * If reclaim is isolating dirty pages under writeback, it implies
- * that the long-lived page allocation rate is exceeding the page
- * laundering rate. Either the global limits are not being effective
- * at throttling processes due to the page distribution throughout
- * zones or there is heavy usage of a slow backing device. The
- * only option is to throttle from reclaim context which is not ideal
- * as there is no guarantee the dirtying process is throttled in the
- * same way balance_dirty_pages() manages.
- *
- * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
- * of pages under pages flagged for immediate reclaim and stall if any
- * are encountered in the nr_immediate check below.
- */
- if (stat.nr_writeback && stat.nr_writeback == nr_taken)
- zone_set_flag(zone, ZONE_WRITEBACK);
-
- if (!global_reclaim(sc) && stat.nr_immediate)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
-
- if (sane_reclaim(sc)) {
- /*
- * Tag a zone as congested if all the dirty pages scanned were
- * backed by a congested BDI and wait_iff_congested will stall.
- */
- if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
- zone_set_flag(zone, ZONE_CONGESTED);
-
- /*
- * If dirty pages are scanned that are not queued for IO, it
- * implies that flushers are not keeping up. In this case, flag
- * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
- * pages from reclaim context.
- */
- if (stat.nr_unqueued_dirty == nr_taken)
- zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
-
- /*
- * If kswapd scans pages marked marked for immediate
- * reclaim and under writeback (nr_immediate), it implies
- * that pages are cycling through the LRU faster than
- * they are written so also forcibly stall.
- */
- if (stat.nr_immediate)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- }
-
- /*
- * Stall direct reclaim for IO completions if underlying BDIs or zone
- * is congested. Allow kswapd to continue until it starts encountering
- * unqueued dirty pages or cycling through the LRU too quickly.
- */
- if (!sc->hibernation_mode && !current_is_kswapd())
- wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
-
trace_mm_vmscan_lru_shrink_inactive(zone_to_nid(zone), zone_idx(zone),
nr_scanned, nr_reclaimed,
stat.nr_dirty, stat.nr_writeback,
@@ -2485,6 +2430,60 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
shrink_slab(slab_gfp, zone_to_nid(zone), NULL,
sc->priority, false);
+ if (global_reclaim(sc)) {
+ /*
+ * If reclaim is isolating dirty pages under writeback, it implies
+ * that the long-lived page allocation rate is exceeding the page
+ * laundering rate. Either the global limits are not being effective
+ * at throttling processes due to the page distribution throughout
+ * zones or there is heavy usage of a slow backing device. The
+ * only option is to throttle from reclaim context which is not ideal
+ * as there is no guarantee the dirtying process is throttled in the
+ * same way balance_dirty_pages() manages.
+ *
+ * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
+ * of pages under pages flagged for immediate reclaim and stall if any
+ * are encountered in the nr_immediate check below.
+ */
+ if (stat.nr_writeback && stat.nr_writeback == stat.nr_taken)
+ zone_set_flag(zone, ZONE_WRITEBACK);
+
+ /*
+ * Tag a zone as congested if all the dirty pages scanned were
+ * backed by a congested BDI and wait_iff_congested will stall.
+ */
+ if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
+ zone_set_flag(zone, ZONE_CONGESTED);
+ /*
+ * If dirty pages are scanned that are not queued for IO, it
+ * implies that flushers are not keeping up. In this case, flag
+ * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
+ * pages from reclaim context.
+ */
+ if (stat.nr_unqueued_dirty == stat.nr_taken)
+ zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+
+ /*
+ * If kswapd scans pages marked marked for immediate
+ * reclaim and under writeback (nr_immediate), it implies
+ * that pages are cycling through the LRU faster than
+ * they are written so also forcibly stall.
+ */
+ if (stat.nr_immediate)
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }
+
+ if (!global_reclaim(sc) && stat.nr_immediate)
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+ /*
+ * Stall direct reclaim for IO completions if underlying BDIs or zone
+ * is congested. Allow kswapd to continue until it starts encountering
+ * unqueued dirty pages or cycling through the LRU too quickly.
+ */
+ if (!sc->hibernation_mode && !current_is_kswapd())
+ wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;