[Devel] [PATCH RHEL7 COMMIT] mm: vmscan: fix the page state calculation in too_many_isolated

Mon Sep 7 03:15:45 PDT 2015

The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.6.7
------>
commit 064aaf330044a444a4a9031c4826173c22e700a2
Author: Vinayak Menon <vinmenon at codeaurora.org>
Date:   Mon Sep 7 14:15:45 2015 +0400

    mm: vmscan: fix the page state calculation in too_many_isolated
    
    It is observed that sometimes multiple tasks get blocked for long in the
    congestion_wait loop below, in shrink_inactive_list.  This is because of
    vm_stat values not being synced.
    
    (__schedule) from [<c0a03328>]
    (schedule_timeout) from [<c0a04940>]
    (io_schedule_timeout) from [<c01d585c>]
    (congestion_wait) from [<c01cc9d8>]
    (shrink_inactive_list) from [<c01cd034>]
    (shrink_zone) from [<c01cdd08>]
    (try_to_free_pages) from [<c01c442c>]
    (__alloc_pages_nodemask) from [<c01f1884>]
    (new_slab) from [<c09fcf60>]
    (__slab_alloc) from [<c01f1a6c>]
    
    In one such instance, zone_page_state(zone, NR_ISOLATED_FILE) had returned
    14, zone_page_state(zone, NR_INACTIVE_FILE) returned 92, and GFP_IOFS was
    set, and this resulted in too_many_isolated returning true.  But one of
    the CPU's pageset vm_stat_diff had NR_ISOLATED_FILE as "-14".  So the
    actual isolated count was zero.  As there weren't any more updates to
    NR_ISOLATED_FILE and vmstat_update deffered work had not been scheduled
    yet, 7 tasks were spinning in the congestion wait loop for around 4
    seconds, in the direct reclaim path.
    
    This patch uses zone_page_state_snapshot instead, but restricts its usage
    to avoid performance penalty.
    
    The vmstat sync interval is HZ (sysctl_stat_interval), but since the
    vmstat_work is declared as a deferrable work, the timer trigger can be
    deferred to the next non-defferable timer expiry on the CPU which is in
    idle.  This results in the vmstat syncing on an idle CPU being delayed by
    seconds.  May be in most cases this behavior is fine, except in cases like
    this.
    
    [akpm at linux-foundation.org: move zone_page_state_snapshot() fallback logic into too_many_isolated()]
    Signed-off-by: Vinayak Menon <vinmenon at codeaurora.org>
    Cc: Johannes Weiner <hannes at cmpxchg.org>
    Cc: Vladimir Davydov <vdavydov at parallels.com>
    Cc: Mel Gorman <mgorman at suse.de>
    Cc: Minchan Kim <minchan at kernel.org>
    Cc: Michal Hocko <mhocko at suse.cz>
    Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
    
    (cherry picked from commit 67df1eb62b35ed132075b52f2e52a582e2df149f)
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    Link: https://jira.sw.ru/browse/PSBM-35155
    Reviewed-by: Kirill Tkhai <ktkhai at odin.com>
---
 mm/vmscan.c | 51 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0ea28cbcc1f7..e0f4e2948b30 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1376,6 +1376,32 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
+static int __too_many_isolated(struct zone *zone, int file,
+			       struct scan_control *sc, int safe)
+{
+	unsigned long inactive, isolated;
+
+	if (safe) {
+		inactive = zone_page_state_snapshot(zone,
+				NR_INACTIVE_ANON + 2 * file);
+		isolated = zone_page_state_snapshot(zone,
+				NR_ISOLATED_ANON + file);
+	} else {
+		inactive = zone_page_state(zone, NR_INACTIVE_ANON + 2 * file);
+		isolated = zone_page_state(zone, NR_ISOLATED_ANON + file);
+	}
+
+	/*
+	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+	 * won't get blocked by normal direct-reclaimers, forming a circular
+	 * deadlock.
+	 */
+	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+		inactive >>= 3;
+
+	return isolated > inactive;
+}
+
 /*
  * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
  * then get resheduled. When there are massive number of tasks doing page
@@ -1384,33 +1410,24 @@ int isolate_lru_page(struct page *page)
  * unnecessary swapping, thrashing and OOM.
  */
 static int too_many_isolated(struct zone *zone, int file,
-		struct scan_control *sc)
+			     struct scan_control *sc)
 {
-	unsigned long inactive, isolated;
-
 	if (current_is_kswapd())
 		return 0;
 
 	if (!global_reclaim(sc))
 		return 0;
 
-	if (file) {
-		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
-	} else {
-		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
-	}
-
 	/*
-	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
-	 * won't get blocked by normal direct-reclaimers, forming a circular
-	 * deadlock.
+	 * __too_many_isolated(safe=0) is fast but inaccurate, because it
+	 * doesn't account for the vm_stat_diff[] counters.  So if it looks
+	 * like too_many_isolated() is about to return true, fall back to the
+	 * slower, more accurate zone_page_state_snapshot().
 	 */
-	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
-		inactive >>= 3;
+	if (unlikely(__too_many_isolated(zone, file, sc, 0)))
+		return __too_many_isolated(zone, file, sc, 1);
 
-	return isolated > inactive;
+	return 0;
 }
 
 static noinline_for_stack void