[Devel] [PATCH rh7 2/2] ms/mm: vmscan: fix IO/refault regression in cache workingset transition

Andrey Ryabinin aryabinin at virtuozzo.com
Wed Jul 12 18:01:25 MSK 2017


This is a simplified (w/o memcg-awareness) version of the upstream commit below.
It is simplified because we don't have a cgroup-aware workingset yet, and
backporting it would be a lot of work.
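
To make the simplification concrete, here is a rough userspace model (illustration
only; the function and parameter names are invented, only zone->refaults and the
WORKINGSET_ACTIVATE counter come from the patch): without per-cgroup workingset
counters, memcg reclaim compares the zone-wide snapshot against itself, so the
refault check added below can only fire during global reclaim.

  #include <stdbool.h>
  #include <stdio.h>

  /*
   * Models the refaults-source selection added to inactive_list_is_low():
   * for memcg reclaim we reuse the zone snapshot itself, so "refaults seen
   * since the last reclaim cycle" is never true and the old behaviour is
   * kept; only global reclaim reads the live WORKINGSET_ACTIVATE counter.
   */
  static bool refault_check_fires(bool memcg_reclaim,
                                  unsigned long zone_refaults_snapshot,
                                  unsigned long workingset_activate_now)
  {
          unsigned long refaults;

          if (memcg_reclaim)
                  refaults = zone_refaults_snapshot;
          else
                  refaults = workingset_activate_now;

          return zone_refaults_snapshot != refaults;
  }

  int main(void)
  {
          /* 100 refaults at the last snapshot, 150 observed now */
          printf("global reclaim: %d\n", refault_check_fires(false, 100, 150));
          printf("memcg  reclaim: %d\n", refault_check_fires(true,  100, 150));
          return 0;
  }

The first line prints 1 (the check fires), the second prints 0 (old behaviour kept).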

commit 2a2e48854d704214dac7546e87ae0e4daa0e61a0
Author: Johannes Weiner <hannes at cmpxchg.org>
Date:   Wed May 3 14:55:03 2017 -0700

    mm: vmscan: fix IO/refault regression in cache workingset transition

    Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file
    list") we noticed bigger IO spikes during changes in cache access
    patterns.

    The patch in question shrunk the inactive list size to leave more room
    for the current workingset in the presence of streaming IO.  However,
    workingset transitions that previously happened on the inactive list are
    now pushed out of memory and incur more refaults to complete.

    This patch disables active list protection when refaults are being
    observed.  This accelerates workingset transitions, and allows more of
    the new set to establish itself from memory, without eating into the
    ability to protect the established workingset during stable periods.

    The workloads that were measurably affected for us were hit pretty bad
    by it, with refault/majfault rates doubling and tripling during cache
    transitions, and the machines sustaining half-hour periods of 100% IO
    utilization, where they'd previously have sub-minute peaks at 60-90%.

    Stateful services that handle user data tend to be more conservative
    with kernel upgrades.  As a result we hit most page cache issues with
    some delay, as was the case here.

    The severity seemed to warrant a stable tag.

    Fixes: 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list")
    Link: http://lkml.kernel.org/r/20170404220052.27593-1-hannes@cmpxchg.org
    Signed-off-by: Johannes Weiner <hannes at cmpxchg.org>
    Cc: Rik van Riel <riel at redhat.com>
    Cc: Mel Gorman <mgorman at suse.de>
    Cc: Michal Hocko <mhocko at suse.com>
    Cc: Vladimir Davydov <vdavydov.dev at gmail.com>
    Cc: <stable at vger.kernel.org>    [4.7+]
    Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>

https://jira.sw.ru/browse/PSBM-68029

Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
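Illustration only (not part of the patch, and the names below are invented; only
the heuristic itself comes from the commit message and the inactive_list_is_low()
hunk): a minimal userspace sketch of the decision, assuming 4K pages and
approximating int_sqrt() with sqrt(). When refaults have been observed since the
last reclaim snapshot, inactive_ratio is forced to 0, so the inactive list is
always reported as low and the active list gets shrunk, i.e. active list
protection is disabled; otherwise the usual size-based ratio applies.

  #include <math.h>
  #include <stdbool.h>
  #include <stdio.h>

  static bool inactive_is_low(unsigned long inactive, unsigned long active,
                              unsigned long refaults, unsigned long snapshot)
  {
          unsigned long inactive_ratio;
          unsigned long gb = (inactive + active) >> 18;   /* 4K pages -> GB */

          if (refaults != snapshot)       /* refaults since the last cycle */
                  inactive_ratio = 0;     /* always deactivate */
          else
                  inactive_ratio = gb ? (unsigned long)sqrt(10 * gb) : 1;

          return inactive * inactive_ratio < active;
  }

  int main(void)
  {
          /* 1GB inactive, 3GB active file pages (in 4K pages), no refaults */
          printf("stable:     %d\n", inactive_is_low(262144, 786432, 100, 100));
          /* same list sizes, but refaults happened since the snapshot */
          printf("refaulting: %d\n", inactive_is_low(262144, 786432, 150, 100));
          return 0;
  }

Built with e.g. "gcc -o sketch sketch.c -lm", the first line prints 0 (the active
list stays protected) and the second prints 1 (protection is dropped so the
workingset transition can complete from memory).
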
 include/linux/mmzone.h |  3 +++
 mm/vmscan.c            | 52 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 386569ffdf7..e27fb5e683a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -430,6 +430,9 @@ struct zone {
 	 */
 	unsigned int inactive_ratio;
 
+	/* Refaults at the time of last reclaim cycle */
+	unsigned long			refaults;
+
 #ifdef CONFIG_MEMCG
 	bool force_scan;
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26e76201b3f..b9e77c303fc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1824,6 +1824,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
  * Both inactive lists should also be large enough that each inactive
  * page has a chance to be referenced again before it is reclaimed.
  *
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
  * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
  * on this LRU, maintained by the pageout code. A zone->inactive_ratio
  * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
@@ -1839,12 +1841,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *    1TB     101        10GB
  *   10TB     320        32GB
  */
-static int inactive_list_is_low(struct lruvec *lruvec, bool file)
+static int inactive_list_is_low(struct lruvec *lruvec, bool file,
+				struct mem_cgroup *memcg, bool actual_reclaim)
 {
+	struct zone *zone = lruvec_zone(lruvec);
 	unsigned long inactive_ratio;
 	unsigned long inactive;
 	unsigned long active;
 	unsigned long gb;
+	unsigned long refaults;
 
 	/*
 	 * If we don't have swap space, anonymous page deactivation
@@ -1856,12 +1861,20 @@ static int inactive_list_is_low(struct lruvec *lruvec, bool file)
 	inactive = get_lru_size(lruvec, file * LRU_FILE);
 	active = get_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
 
-	gb = (inactive + active) >> (30 - PAGE_SHIFT);
-	if (gb)
-		inactive_ratio = int_sqrt(10 * gb);
-	else
-		inactive_ratio = 1;
+	if (memcg)
+		refaults = zone->refaults; /* we don't support per-cgroup workingset */
+	else
+		refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
 
+	if (file && actual_reclaim && zone->refaults != refaults) {
+		inactive_ratio = 0;
+	} else {
+		gb = (inactive + active) >> (30 - PAGE_SHIFT);
+		if (gb)
+			inactive_ratio = int_sqrt(10 * gb);
+		else
+			inactive_ratio = 1;
+	}
 	return inactive * inactive_ratio < active;
 }
 
@@ -1869,7 +1882,8 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 				 struct lruvec *lruvec, struct scan_control *sc)
 {
 	if (is_active_lru(lru)) {
-		if (inactive_list_is_low(lruvec, is_file_lru(lru)))
+		if (inactive_list_is_low(lruvec, is_file_lru(lru),
+					sc->target_mem_cgroup, true))
 			shrink_active_list(nr_to_scan, lruvec, sc, lru);
 		return 0;
 	}
@@ -2034,7 +2048,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 * There is enough inactive page cache, do not reclaim
 	 * anything from the anonymous working set right now.
 	 */
-	if (!inactive_list_is_low(lruvec, true) &&
+	if (!inactive_list_is_low(lruvec, true, sc->target_mem_cgroup, false) &&
 	    get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority > 0) {
 		scan_balance = SCAN_FILE;
 		goto out;
@@ -2258,7 +2272,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc,
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_list_is_low(lruvec, false))
+	if (inactive_list_is_low(lruvec, false, sc->target_mem_cgroup, true))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
 
@@ -2619,6 +2633,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	unsigned long total_scanned = 0;
 	unsigned long writeback_threshold;
 	bool aborted_reclaim;
+	struct zone *zone;
+	struct zoneref *z;
 
 retry:
 	{KSTAT_PERF_ENTER(ttfp);
@@ -2663,6 +2679,11 @@ retry:
 	} while (--sc->priority >= 0 && !aborted_reclaim);
 
 out:
+	if (!sc->target_mem_cgroup)
+		for_each_zone_zonelist_nodemask(zone, z, zonelist,
+					gfp_zone(sc->gfp_mask), sc->nodemask)
+			zone->refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+
 	delayacct_freepages_end();
 	KSTAT_PERF_LEAVE(ttfp);}
 
@@ -2953,7 +2974,8 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
 	do {
 		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
-		if (inactive_list_is_low(lruvec, false))
+		if (inactive_list_is_low(lruvec, false,
+					sc->target_mem_cgroup, true))
 			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 					   sc, LRU_ACTIVE_ANON);
 
@@ -3338,6 +3360,16 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 		 !pgdat_balanced(pgdat, order, *classzone_idx));
 
 out:
+
+	for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		if (!populated_zone(zone))
+			continue;
+
+		zone->refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+	}
+
 	/*
 	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 	 * makes a decision on the order we were last reclaiming at. However,
-- 
2.13.0


