[Devel] [PATCH RHEL7 COMMIT] mm: Port diff-mm-add-sysctl-vm.force_scan_thresh

Konstantin Khorenko khorenko at virtuozzo.com
Fri Jun 5 02:21:16 PDT 2015


The commit is pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.9
------>
commit 70f77c1ffea46c9b7336002dcd83b0470fbb2510
Author: Vladimir Davydov <vdavydov at parallels.com>
Date:   Fri Jun 5 13:21:16 2015 +0400

    mm: Port diff-mm-add-sysctl-vm.force_scan_thresh
    
    Author: Vladimir Davydov
    Email: vdavydov at parallels.com
    Subject: mm: add sysctl vm.force_scan_thresh
    Date: Fri, 14 Nov 2014 19:50:10 +0300
    
    If an lru list is so small that we can't scan anything from it on the
    current scan prio, we skip it. Therefore if all lru lists are small, we
    will be looping over them until we descend to an appropriate scan prio,
    which may be quite cpu consuming. Although this isn't relevant for VZ
    setups, where we usually have a relatively small amount of medium-sized
    containers, it becomes really painful on CL setups with thousands of
    very small (several hundred kilobytes) LVEs.
    
    It's not obvious how we could solve this problem w/o reworking the scan
    pressure distribution algorithm, so this patch adds a workaround. It
    introduces new sysctl knob vm.force_scan_thresh. If the share of tiny
    lrus (i.e. those that can't be scanned on the default priority) is
    greater than the percentage defined by the sysctl, both kswapd and the
    direct reclaim path will be forced to scan an lru even if it doesn't
    have enough pages to be scanned on the current prio. The default value
    is set to 50.
    
    https://jira.sw.ru/browse/PSBM-29968
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    
    Acked-by: Andrey Vagin <avagin at parallels.com>
    =============================================================================
    
    Related to https://jira.sw.ru/browse/PSBM-33640
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 include/linux/mmzone.h |  4 ++++
 include/linux/swap.h   |  4 ++++
 kernel/sysctl.c        | 11 +++++++++++
 mm/vmscan.c            | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/vmstat.c            |  6 ++++--
 5 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 097393d..21dc3a4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -414,6 +414,10 @@ struct zone {
 	 */
 	unsigned int inactive_ratio;
 
+#ifdef CONFIG_MEMCG
+	bool force_scan;
+#endif
+
 
 	ZONE_PADDING(_pad2_)
 	/* Rarely used or read-mostly fields */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 12a0433..9c32bcd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -330,6 +330,10 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
+#ifdef CONFIG_MEMCG
+extern int sysctl_force_scan_thresh;
+#endif
+
 extern int page_evictable(struct page *page);
 extern void check_move_unevictable_pages(struct page **, int nr_pages);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3dde6ac..fe20216 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1525,6 +1525,17 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
+#ifdef CONFIG_MEMCG
+	{
+		.procname	= "force_scan_thresh",
+		.data		= &sysctl_force_scan_thresh,
+		.maxlen		= sizeof(sysctl_force_scan_thresh),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 	{ }
 };
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 836bdb3..39fd2eb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1873,6 +1873,51 @@ static int vmscan_swappiness(struct scan_control *sc)
 	return mem_cgroup_swappiness(sc->target_mem_cgroup);
 }
 
+#ifdef CONFIG_MEMCG
+int sysctl_force_scan_thresh = 50;
+
+static inline bool zone_force_scan(struct zone *zone)
+{
+	return zone->force_scan;
+}
+
+static void zone_update_force_scan(struct zone *zone)
+{
+	struct mem_cgroup *memcg;
+	int tiny, total;
+
+	tiny = total = 0;
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+		unsigned long size;
+
+		size = max(get_lru_size(lruvec, LRU_ACTIVE_FILE),
+			   get_lru_size(lruvec, LRU_INACTIVE_FILE));
+		if (get_nr_swap_pages() > 0)
+			size = max3(size,
+				    get_lru_size(lruvec, LRU_ACTIVE_ANON),
+				    get_lru_size(lruvec, LRU_INACTIVE_ANON));
+
+		if (size && size >> DEF_PRIORITY == 0)
+			tiny++;
+		total++;
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+	zone->force_scan = tiny * 100 > total * sysctl_force_scan_thresh;
+}
+#else
+static inline bool zone_force_scan(struct zone *zone)
+{
+	return false;
+}
+
+static inline void zone_update_force_scan(struct zone *zone)
+{
+}
+#endif
+
 enum scan_balance {
 	SCAN_EQUAL,
 	SCAN_FRACT,
@@ -1917,6 +1962,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 		force_scan = true;
 	if (!global_reclaim(sc))
 		force_scan = true;
+	if (zone_force_scan(zone))
+		force_scan = true;
 
 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
@@ -3031,6 +3078,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
+			zone_update_force_scan(zone);
+
 			if (zone->all_unreclaimable &&
 			    sc.priority != DEF_PRIORITY)
 				continue;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c4144bf..a42b71b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1060,10 +1060,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 	seq_printf(m,
 		   "\n  all_unreclaimable: %u"
 		   "\n  start_pfn:         %lu"
-		   "\n  inactive_ratio:    %u",
+		   "\n  inactive_ratio:    %u"
+		   "\n  force_scan:        %d",
 		   zone->all_unreclaimable,
 		   zone->zone_start_pfn,
-		   zone->inactive_ratio);
+		   zone->inactive_ratio,
+		   zone->force_scan);
 	seq_putc(m, '\n');
 }
 



More information about the Devel mailing list