[Devel] [PATCH rh7] Port diff-mm-add-sysctl-vm.force_scan_thresh

Vladimir Davydov vdavydov at parallels.com
Tue Jun 2 08:39:51 PDT 2015


Author: Vladimir Davydov
Email: vdavydov at parallels.com
Subject: mm: add sysctl vm.force_scan_thresh
Date: Fri, 14 Nov 2014 19:50:10 +0300

If an lru list is so small that we can't scan anything from it at the
current scan priority, we skip it. Therefore, if all lru lists are
small, we keep looping over them until we descend to a low enough scan
priority, which may be quite CPU-consuming. Although this isn't
relevant for VZ setups, where we usually have a relatively small number
of medium-sized containers, it becomes really painful on CL setups with
thousands of very small (several hundred kilobytes) LVEs.
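For reference, the per-priority scan target is roughly lru_size >>
priority, so with DEF_PRIORITY == 12 any lru with fewer than 4096
pages contributes nothing until the priority drops. A minimal
illustrative sketch (helper name hypothetical, not part of the patch):

	/*
	 * Sketch of how the scan target behaves per priority: a tiny lru
	 * yields 0 at DEF_PRIORITY (12) and is skipped until reclaim
	 * descends to a lower priority.
	 */
	static unsigned long scan_target(unsigned long lru_size, int priority)
	{
		return lru_size >> priority;	/* 0 for lrus < 4096 pages at prio 12 */
	}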

It's not obvious how we could solve this problem w/o reworking the scan
pressure distribution algorithm, so this patch adds a workaround. It
introduces a new sysctl knob, vm.force_scan_thresh. If the share of
tiny lrus (i.e. those that can't be scanned at the default priority) is
greater than the percentage defined by the sysctl, both kswapd and the
direct reclaim path will be forced to scan an lru even if it doesn't
have enough pages to be scanned at the current priority. The default
value is 50 (percent).
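
For example, with the default threshold of 50, a zone hosting 1000
per-memcg lruvecs of which 600 are too small to be scanned at
DEF_PRIORITY gets force_scan set, since 600 * 100 > 1000 * 50. A
minimal sketch of that check (helper name hypothetical; the real
implementation is zone_update_force_scan() in the diff below):

	/* The decision zone_update_force_scan() boils down to. */
	static bool lruvecs_mostly_tiny(int tiny, int total, int thresh)
	{
		/* true when the share of tiny lruvecs exceeds thresh percent */
		return tiny * 100 > total * thresh;
	}

The knob is exposed as /proc/sys/vm/force_scan_thresh, and the per-zone
decision is visible as the force_scan field in /proc/zoneinfo (see the
mm/vmstat.c hunk).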

https://jira.sw.ru/browse/PSBM-29968

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
Acked-by: Andrey Vagin <avagin at parallels.com>
=============================================================================

Related to https://jira.sw.ru/browse/PSBM-33640

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 include/linux/mmzone.h |  4 ++++
 include/linux/swap.h   |  4 ++++
 kernel/sysctl.c        | 11 +++++++++++
 mm/vmscan.c            | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/vmstat.c            |  6 ++++--
 5 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 097393d0bcae..21dc3a44179a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -414,6 +414,10 @@ struct zone {
 	 */
 	unsigned int inactive_ratio;
 
+#ifdef CONFIG_MEMCG
+	bool force_scan;
+#endif
+
 
 	ZONE_PADDING(_pad2_)
 	/* Rarely used or read-mostly fields */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 12a04334acbf..9c32bcddffe7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -330,6 +330,10 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
+#ifdef CONFIG_MEMCG
+extern int sysctl_force_scan_thresh;
+#endif
+
 extern int page_evictable(struct page *page);
 extern void check_move_unevictable_pages(struct page **, int nr_pages);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3dde6acc2820..fe202160e38b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1525,6 +1525,17 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
+#ifdef CONFIG_MEMCG
+	{
+		.procname	= "force_scan_thresh",
+		.data		= &sysctl_force_scan_thresh,
+		.maxlen		= sizeof(sysctl_force_scan_thresh),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 	{ }
 };
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 836bdb352160..39fd2eb5749d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1873,6 +1873,51 @@ static int vmscan_swappiness(struct scan_control *sc)
 	return mem_cgroup_swappiness(sc->target_mem_cgroup);
 }
 
+#ifdef CONFIG_MEMCG
+int sysctl_force_scan_thresh = 50;
+
+static inline bool zone_force_scan(struct zone *zone)
+{
+	return zone->force_scan;
+}
+
+static void zone_update_force_scan(struct zone *zone)
+{
+	struct mem_cgroup *memcg;
+	int tiny, total;
+
+	tiny = total = 0;
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+		unsigned long size;
+
+		size = max(get_lru_size(lruvec, LRU_ACTIVE_FILE),
+			   get_lru_size(lruvec, LRU_INACTIVE_FILE));
+		if (get_nr_swap_pages() > 0)
+			size = max3(size,
+				    get_lru_size(lruvec, LRU_ACTIVE_ANON),
+				    get_lru_size(lruvec, LRU_INACTIVE_ANON));
+
+		if (size && size >> DEF_PRIORITY == 0)
+			tiny++;
+		total++;
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+	zone->force_scan = tiny * 100 > total * sysctl_force_scan_thresh;
+}
+#else
+static inline bool zone_force_scan(struct zone *zone)
+{
+	return false;
+}
+
+static inline void zone_update_force_scan(struct zone *zone)
+{
+}
+#endif
+
 enum scan_balance {
 	SCAN_EQUAL,
 	SCAN_FRACT,
@@ -1917,6 +1962,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 		force_scan = true;
 	if (!global_reclaim(sc))
 		force_scan = true;
+	if (zone_force_scan(zone))
+		force_scan = true;
 
 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
@@ -3031,6 +3078,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
+			zone_update_force_scan(zone);
+
 			if (zone->all_unreclaimable &&
 			    sc.priority != DEF_PRIORITY)
 				continue;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c4144bf00f21..a42b71b72181 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1060,10 +1060,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 	seq_printf(m,
 		   "\n  all_unreclaimable: %u"
 		   "\n  start_pfn:         %lu"
-		   "\n  inactive_ratio:    %u",
+		   "\n  inactive_ratio:    %u"
+		   "\n  force_scan:        %d",
 		   zone->all_unreclaimable,
 		   zone->zone_start_pfn,
-		   zone->inactive_ratio);
+		   zone->inactive_ratio,
+		   zone->force_scan);
 	seq_putc(m, '\n');
 }
 
-- 
2.1.4
