[Devel] [PATCH RHEL8 COMMIT] mm/vmscan: shrink tcache ahead of everything else
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Apr 2 16:35:31 MSK 2020
The commit is pushed to "branch-rh8-4.18.0-80.1.2.vz8.3.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-80.1.2.vz8.3.4
------>
commit fab68564f8c6252b213acdbc405b0fd4eaa9d3b0
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date: Thu Apr 2 16:35:30 2020 +0300
mm/vmscan: shrink tcache ahead of everything else
We don't want to evict page cache or swap out anonymous pages while
there are a lot of reclaimable pages in tcache. Reclaim tcache first,
and only after that reclaim the rest if still required.
Notes:
1) Keep the generic tcache shrinkers, so that if new tcache pages are
generated heavily, the background kswapd thread does not forget to
shrink tcache.
2) In shrink_tcache() we don't break out of the for_each_node_mask()
loop even if shrinking the first node already gives us enough
nr_reclaimed. We want to apply similar memory pressure to all nodes
rather than thrash only the first one and stop (see the sketch below).
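For illustration, here is a minimal standalone userspace sketch (not kernel
code) of the ordering described above: drain tcache on every node first,
spreading the pressure evenly, and fall back to regular LRU reclaim only if
the target is still not met. The names (shrink_tcache_node_sim, NR_NODES)
and the per-node page counts are made up for the example; the real
implementation is in the hunks below.

#include <stdio.h>

#define NR_NODES	2
#define SCAN_BATCH	128UL	/* mirrors TCACHE_SCAN_BATCH below */

/* Hypothetical number of reclaimable tcache pages on each node. */
static unsigned long tcache_pages[NR_NODES] = { 300, 50 };

/* Reclaim up to nr_to_scan tcache pages from one node, one batch at most. */
static unsigned long shrink_tcache_node_sim(int nid, unsigned long nr_to_scan)
{
	unsigned long nr = nr_to_scan;

	if (nr > SCAN_BATCH)
		nr = SCAN_BATCH;
	if (nr > tcache_pages[nid])
		nr = tcache_pages[nid];
	tcache_pages[nid] -= nr;
	return nr;
}

/* Walk all nodes repeatedly until the target is met or tcache is drained. */
static unsigned long shrink_tcache_sim(unsigned long nr_to_reclaim)
{
	unsigned long nr_reclaimed = 0, shrunk, want, got;
	int nid;

	do {
		shrunk = 0;
		for (nid = 0; nid < NR_NODES; nid++) {
			/*
			 * Even after the target is met, keep scanning the
			 * remaining nodes at batch granularity so pressure
			 * is spread evenly instead of hitting node 0 only.
			 */
			want = nr_reclaimed < nr_to_reclaim ?
				nr_to_reclaim - nr_reclaimed : SCAN_BATCH;
			got = shrink_tcache_node_sim(nid, want);
			shrunk += got;
			nr_reclaimed += got;
		}
	} while (shrunk && nr_reclaimed < nr_to_reclaim);

	return nr_reclaimed;
}

int main(void)
{
	unsigned long target = 200, got = shrink_tcache_sim(target);

	printf("tcache reclaimed %lu of %lu pages\n", got, target);
	if (got < target)
		printf("falling back to regular LRU reclaim\n");
	printf("tcache left: node0=%lu node1=%lu\n",
	       tcache_pages[0], tcache_pages[1]);
	return 0;
}

With these example numbers both nodes end up scanned (150 and 0 tcache
pages left, respectively), which is why the per-node loop is not broken
as soon as the first node satisfies the target.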
https://jira.sw.ru/browse/PSBM-89403
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
mm/internal.h | 29 +++++++++++++++++++++++++++++
mm/tcache.c | 8 ++++----
mm/vmscan.c | 43 +++++++++++++++++++++++++++++++++++++++++--
3 files changed, 74 insertions(+), 6 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 9e3654d70289..eec4fc63cf4b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -507,6 +507,35 @@ struct tlbflush_unmap_batch;
*/
extern struct workqueue_struct *mm_percpu_wq;
+#ifdef CONFIG_TCACHE
+unsigned long tcache_shrink_scan(struct shrinker *shrinker,
+ struct shrink_control *sc);
+unsigned long tcache_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+
+static inline unsigned long shrink_tcache_node(struct shrink_control *sc)
+{
+ unsigned long ret;
+ extern bool tcache_enabled;
+
+ if (!READ_ONCE(tcache_enabled))
+ return 0;
+
+ ret = tcache_shrink_count(NULL, sc);
+ if (!ret)
+ return ret;
+
+ ret = tcache_shrink_scan(NULL, sc);
+ if (ret == SHRINK_STOP)
+ ret = 0;
+ return ret;
+}
+#else
+static inline unsigned long shrink_tcache_node(struct shrink_control *sc)
+{ return 0; }
+#endif
+
+
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
diff --git a/mm/tcache.c b/mm/tcache.c
index 6660687e3871..c7c5008fdac8 100644
--- a/mm/tcache.c
+++ b/mm/tcache.c
@@ -170,7 +170,7 @@ static struct tcache_nodeinfo *tcache_nodeinfo;
*/
/* Enable/disable tcache backend (set at boot time) */
-static bool tcache_enabled __read_mostly = true;
+bool tcache_enabled __read_mostly = true;
module_param_named(enabled, tcache_enabled, bool, 0444);
/* Enable/disable populating the cache */
@@ -1176,7 +1176,7 @@ static struct page *tcache_alloc_page(struct tcache_pool *pool)
return page;
}
-static unsigned long tcache_shrink_count(struct shrinker *shrink,
+unsigned long tcache_shrink_count(struct shrinker *shrink,
struct shrink_control *sc)
{
atomic_long_t *nr_pages = &tcache_nodeinfo[sc->nid].nr_pages;
@@ -1190,13 +1190,13 @@ static unsigned long tcache_shrink_count(struct shrinker *shrink,
#define TCACHE_SCAN_BATCH 128UL
static DEFINE_PER_CPU(struct page * [TCACHE_SCAN_BATCH], tcache_page_vec);
-static unsigned long tcache_shrink_scan(struct shrinker *shrink,
+unsigned long tcache_shrink_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct page **pages = get_cpu_var(tcache_page_vec);
int nr_isolated, nr_reclaimed;
- if (WARN_ON(sc->nr_to_scan > TCACHE_SCAN_BATCH))
+ if (sc->nr_to_scan > TCACHE_SCAN_BATCH)
sc->nr_to_scan = TCACHE_SCAN_BATCH;
nr_isolated = tcache_lru_isolate(sc->nid, pages, sc->nr_to_scan);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 123dfa493fd9..d99fb2be1c36 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2800,6 +2800,35 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
target_lruvec->refaults = refaults;
}
+static void shrink_tcache(struct scan_control *scan_ctrl)
+{
+ int nid;
+ unsigned long shrunk;
+ nodemask_t *nodemask = scan_ctrl->nodemask ? : &node_online_map;
+
+ do {
+ shrunk = 0;
+
+ for_each_node_mask(nid, *nodemask) {
+ struct shrink_control sc = {
+ .gfp_mask = scan_ctrl->gfp_mask,
+ .nid = nid,
+ .memcg = NULL,
+ .nr_to_scan = scan_ctrl->nr_to_reclaim -
+ scan_ctrl->nr_reclaimed,
+ };
+ shrunk = shrink_tcache_node(&sc);
+ scan_ctrl->nr_reclaimed += shrunk;
+ /*
+ * We scan all nodes even if we reclaim more than
+ * nr_to_reclaim, we want to make similar memory
+ * pressure on all nodes and not to thrash only the
+ * first one and stop.
+ */
+ }
+ } while (shrunk && scan_ctrl->nr_reclaimed < scan_ctrl->nr_to_reclaim);
+}
+
/*
* This is the main entry point to direct page reclaim.
*
@@ -2826,8 +2855,12 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
retry:
delayacct_freepages_start();
- if (!cgroup_reclaim(sc))
+ if (!cgroup_reclaim(sc)) {
__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
+ shrink_tcache(sc);
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim)
+ goto out;
+ }
do {
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
@@ -2866,7 +2899,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
}
}
-
+out:
delayacct_freepages_end();
if (sc->nr_reclaimed)
@@ -3393,6 +3426,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
sc.gfp_mask, &nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
+ shrink_tcache(&sc);
+ if (sc.nr_reclaimed >= sc.nr_to_reclaim &&
+ pgdat_balanced(pgdat, order, classzone_idx))
+ goto out;
+
+
/*
* There should be no need to raise the scanning priority if
* enough pages are already being scanned that that high