[Devel] [PATCH RHEL7 COMMIT] ms/mm: vmscan: fix IO/refault regression in cache workingset transition
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Jul 13 18:41:24 MSK 2017
The commit is pushed to "branch-rh7-3.10.0-514.26.1.vz7.33.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.26.1.vz7.33.6
------>
commit 107a8bf713c484ac3ef055afc165acf1f3c29d29
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date: Thu Jul 13 19:41:24 2017 +0400
ms/mm: vmscan: fix IO/refault regression in cache workingset transition
This is a simplified (w/o memcg-awareness) version of the upstream commit below.
It is simplified because we don't have a cgroup-aware workingset yet, and
backporting it would be a lot of work.
commit 2a2e48854d704214dac7546e87ae0e4daa0e61a0
Author: Johannes Weiner <hannes at cmpxchg.org>
Date: Wed May 3 14:55:03 2017 -0700
mm: vmscan: fix IO/refault regression in cache workingset transition
Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file
list") we noticed bigger IO spikes during changes in cache access
patterns.
The patch in question shrunk the inactive list size to leave more room
for the current workingset in the presence of streaming IO. However,
workingset transitions that previously happened on the inactive list are
now pushed out of memory and incur more refaults to complete.
This patch disables active list protection when refaults are being
observed. This accelerates workingset transitions, and allows more of
the new set to establish itself from memory, without eating into the
ability to protect the established workingset during stable periods.
The workloads that were measurably affected for us were hit pretty bad
by it, with refault/majfault rates doubling and tripling during cache
transitions, and the machines sustaining half-hour periods of 100% IO
utilization, where they'd previously have sub-minute peaks at 60-90%.
Stateful services that handle user data tend to be more conservative
with kernel upgrades. As a result we hit most page cache issues with
some delay, as was the case here.
The severity seemed to warrant a stable tag.
Fixes: 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list")
Link: http://lkml.kernel.org/r/20170404220052.27593-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes at cmpxchg.org>
Cc: Rik van Riel <riel at redhat.com>
Cc: Mel Gorman <mgorman at suse.de>
Cc: Michal Hocko <mhocko at suse.com>
Cc: Vladimir Davydov <vdavydov.dev at gmail.com>
Cc: <stable at vger.kernel.org> [4.7+]
Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
https://jira.sw.ru/browse/PSBM-68029
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
include/linux/mmzone.h | 3 +++
mm/vmscan.c | 52 ++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 45 insertions(+), 10 deletions(-)
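A rough userspace model of the change, as a reading aid for the diff below. This is a
sketch only: the struct, helper names and the 4K page-size constant are illustrative,
not the kernel's, and the memcg path (which simply keeps the old behaviour in this
backport) is left out. It shows the two moving parts: the patched inactive_list_is_low()
drops the active-list protection (inactive_ratio = 0) when file refaults have been
observed since the last reclaim cycle, and the new code at the end of
do_try_to_free_pages()/balance_pgdat() snapshots WORKINGSET_ACTIVATE into
zone->refaults so the next cycle can detect such a transition.

/*
 * Illustrative userspace model (not kernel code) of the patched logic.
 * Build: cc -o refault_model refault_model.c -lm
 */
#include <math.h>
#include <stdbool.h>
#include <stdio.h>

#define MODEL_PAGE_SHIFT 12			/* assume 4K pages for the model */

struct zone_model {
	unsigned long refaults;			/* snapshot taken at last reclaim */
	unsigned long workingset_activate;	/* live WORKINGSET_ACTIVATE counter */
};

/* Mirrors the gated check in the patched inactive_list_is_low(). */
static bool inactive_list_is_low_model(const struct zone_model *zone,
				       unsigned long inactive,
				       unsigned long active,
				       bool file, bool actual_reclaim)
{
	unsigned long inactive_ratio;
	unsigned long gb;

	if (file && actual_reclaim &&
	    zone->refaults != zone->workingset_activate) {
		/* Refaults since last reclaim: stop protecting the active list. */
		inactive_ratio = 0;
	} else {
		/* Usual sizing: ratio = int_sqrt(10 * size-in-GB), minimum 1. */
		gb = (inactive + active) >> (30 - MODEL_PAGE_SHIFT);
		inactive_ratio = gb ? (unsigned long)sqrt(10.0 * gb) : 1;
	}

	return inactive * inactive_ratio < active;
}

/* Mirrors the snapshots added to do_try_to_free_pages()/balance_pgdat(). */
static void end_of_reclaim_cycle(struct zone_model *zone)
{
	zone->refaults = zone->workingset_activate;
}

int main(void)
{
	/* 4GB of file cache: 1GB inactive, 3GB active. */
	unsigned long inactive = 1UL << (30 - MODEL_PAGE_SHIFT);
	unsigned long active   = 3UL << (30 - MODEL_PAGE_SHIFT);
	struct zone_model zone = { .refaults = 0, .workingset_activate = 0 };

	/* Stable workingset: ratio ~6, the active list stays protected. */
	printf("stable:     deactivate=%d\n",
	       inactive_list_is_low_model(&zone, inactive, active, true, true));

	/* Workingset transition: refaults get activated between reclaims. */
	zone.workingset_activate += 1000;
	printf("refaulting: deactivate=%d\n",
	       inactive_list_is_low_model(&zone, inactive, active, true, true));

	/* Reclaim finishes, the baseline is re-snapshotted, protection returns. */
	end_of_reclaim_cycle(&zone);
	printf("settled:    deactivate=%d\n",
	       inactive_list_is_low_model(&zone, inactive, active, true, true));

	return 0;
}

Running this prints deactivate=0 for the stable and settled cases and deactivate=1 while
refaulting, which is the behaviour change described in the commit message: the active
list is only deactivated into while a workingset transition is actually being observed.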
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 386569f..e27fb5e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -430,6 +430,9 @@ struct zone {
*/
unsigned int inactive_ratio;
+ /* Refaults at the time of last reclaim cycle */
+ unsigned long refaults;
+
#ifdef CONFIG_MEMCG
bool force_scan;
#endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26e7620..b9e77c3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1824,6 +1824,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
* Both inactive lists should also be large enough that each inactive
* page has a chance to be referenced again before it is reclaimed.
*
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
* The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
* on this LRU, maintained by the pageout code. A zone->inactive_ratio
* of 3 means 3:1 or 25% of the pages are kept on the inactive list.
@@ -1839,12 +1841,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 1TB 101 10GB
* 10TB 320 32GB
*/
-static int inactive_list_is_low(struct lruvec *lruvec, bool file)
+static int inactive_list_is_low(struct lruvec *lruvec, bool file,
+ struct mem_cgroup *memcg, bool actual_reclaim)
{
+ struct zone *zone = lruvec_zone(lruvec);
unsigned long inactive_ratio;
unsigned long inactive;
unsigned long active;
unsigned long gb;
+ unsigned long refaults;
/*
* If we don't have swap space, anonymous page deactivation
@@ -1856,12 +1861,20 @@ static int inactive_list_is_low(struct lruvec *lruvec, bool file)
inactive = get_lru_size(lruvec, file * LRU_FILE);
active = get_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
- else
- inactive_ratio = 1;
+ if (memcg)
+ refaults = zone->refaults; /* we don't support per-cgroup workingset */
+ else
+ refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+ if (file && actual_reclaim && zone->refaults != refaults) {
+ inactive_ratio = 0;
+ } else {
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+ }
return inactive * inactive_ratio < active;
}
@@ -1869,7 +1882,8 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru)))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru),
+ sc->target_mem_cgroup, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2034,7 +2048,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* There is enough inactive page cache, do not reclaim
* anything from the anonymous working set right now.
*/
- if (!inactive_list_is_low(lruvec, true) &&
+ if (!inactive_list_is_low(lruvec, true, sc->target_mem_cgroup, false) &&
get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority > 0) {
scan_balance = SCAN_FILE;
goto out;
@@ -2258,7 +2272,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc,
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false))
+ if (inactive_list_is_low(lruvec, false, sc->target_mem_cgroup, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -2619,6 +2633,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
bool aborted_reclaim;
+ struct zone *zone;
+ struct zoneref *z;
retry:
{KSTAT_PERF_ENTER(ttfp);
@@ -2663,6 +2679,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
} while (--sc->priority >= 0 && !aborted_reclaim);
out:
+ if (!sc->target_mem_cgroup)
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask)
+ zone->refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+
delayacct_freepages_end();
KSTAT_PERF_LEAVE(ttfp);}
@@ -2953,7 +2974,8 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
do {
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- if (inactive_list_is_low(lruvec, false))
+ if (inactive_list_is_low(lruvec, false,
+ sc->target_mem_cgroup, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -3338,6 +3360,16 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
!pgdat_balanced(pgdat, order, *classzone_idx));
out:
+
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ zone->refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+ }
+
/*
* Return the order we were reclaiming at so prepare_kswapd_sleep()
* makes a decision on the order we were last reclaiming at. However,