[Devel] [PATCH RHEL7 COMMIT] mm: Port diff-mm-add-sysctl-vm.force_scan_thresh
Konstantin Khorenko
khorenko at virtuozzo.com
Fri Jun 5 02:21:16 PDT 2015
The commit is pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.9
------>
commit 70f77c1ffea46c9b7336002dcd83b0470fbb2510
Author: Vladimir Davydov <vdavydov at parallels.com>
Date: Fri Jun 5 13:21:16 2015 +0400
mm: Port diff-mm-add-sysctl-vm.force_scan_thresh
Author: Vladimir Davydov
Email: vdavydov at parallels.com
Subject: mm: add sysctl vm.force_scan_thresh
Date: Fri, 14 Nov 2014 19:50:10 +0300
If an lru list is so small that we can't scan anything from it at the
current scan prio, we skip it. Therefore, if all lru lists are small, we
will keep looping over them until we descend to a low enough scan prio
(at DEF_PRIORITY the scan target is size >> 12, which is zero for any
lru smaller than 4096 pages), which may be quite CPU consuming. Although
this isn't relevant for VZ setups, where we usually have a relatively
small number of medium-sized containers, it becomes really painful on CL
setups with thousands of very small (several hundred kilobytes) LVEs.
It's not obvious how we could solve this problem w/o reworking the scan
pressure distribution algorithm, so this patch adds a workaround. It
introduces a new sysctl knob, vm.force_scan_thresh. If the share of tiny
lrus (i.e. those that can't be scanned at the default priority) is
greater than the percentage defined by the sysctl, both kswapd and the
direct reclaim path will be forced to scan an lru even if it doesn't
have enough pages to be scanned at the current prio. The default value
is 50.
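For illustration, below is a minimal userspace C sketch of the same
check that zone_update_force_scan() performs in the patch: an lruvec is
"tiny" when its largest evictable lru contributes nothing at
DEF_PRIORITY, and the zone is flagged when the share of tiny lruvecs
exceeds the threshold percent. The sample sizes, the helper names and
the DEF_PRIORITY value of 12 are assumptions made for this example
only; the knob itself is exposed as /proc/sys/vm/force_scan_thresh.

/*
 * Sketch of the force_scan decision (assumed DEF_PRIORITY == 12).
 * An lruvec is "tiny" when its largest evictable lru yields a zero
 * scan target at DEF_PRIORITY; the zone is flagged when the share of
 * tiny lruvecs exceeds the threshold percent.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY 12

static bool lruvec_is_tiny(unsigned long largest_lru_size)
{
	return largest_lru_size && (largest_lru_size >> DEF_PRIORITY) == 0;
}

static bool zone_should_force_scan(const unsigned long *largest_lru_sizes,
				   int nr_lruvecs, int thresh)
{
	int tiny = 0, total = 0, i;

	for (i = 0; i < nr_lruvecs; i++) {
		if (lruvec_is_tiny(largest_lru_sizes[i]))
			tiny++;
		total++;
	}
	/* same comparison as in the patch: tiny share > thresh percent */
	return tiny * 100 > total * thresh;
}

int main(void)
{
	/* e.g. three lruvecs of ~200 pages each, one of ~100000 pages */
	unsigned long sizes[] = { 200, 180, 220, 100000 };

	/* 3 of 4 lruvecs are tiny: 300 > 200, so force_scan is set */
	printf("force_scan: %d\n", zone_should_force_scan(sizes, 4, 50));
	return 0;
}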
https://jira.sw.ru/browse/PSBM-29968
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
Acked-by: Andrey Vagin <avagin at parallels.com>
=============================================================================
Related to https://jira.sw.ru/browse/PSBM-33640
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
include/linux/mmzone.h | 4 ++++
include/linux/swap.h | 4 ++++
kernel/sysctl.c | 11 +++++++++++
mm/vmscan.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
mm/vmstat.c | 6 ++++--
5 files changed, 72 insertions(+), 2 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 097393d..21dc3a4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -414,6 +414,10 @@ struct zone {
*/
unsigned int inactive_ratio;
+#ifdef CONFIG_MEMCG
+ bool force_scan;
+#endif
+
ZONE_PADDING(_pad2_)
/* Rarely used or read-mostly fields */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 12a0433..9c32bcd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -330,6 +330,10 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
}
#endif
+#ifdef CONFIG_MEMCG
+extern int sysctl_force_scan_thresh;
+#endif
+
extern int page_evictable(struct page *page);
extern void check_move_unevictable_pages(struct page **, int nr_pages);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3dde6ac..fe20216 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1525,6 +1525,17 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+#ifdef CONFIG_MEMCG
+ {
+ .procname = "force_scan_thresh",
+ .data = &sysctl_force_scan_thresh,
+ .maxlen = sizeof(sysctl_force_scan_thresh),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#endif
{ }
};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 836bdb3..39fd2eb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1873,6 +1873,51 @@ static int vmscan_swappiness(struct scan_control *sc)
return mem_cgroup_swappiness(sc->target_mem_cgroup);
}
+#ifdef CONFIG_MEMCG
+int sysctl_force_scan_thresh = 50;
+
+static inline bool zone_force_scan(struct zone *zone)
+{
+ return zone->force_scan;
+}
+
+static void zone_update_force_scan(struct zone *zone)
+{
+ struct mem_cgroup *memcg;
+ int tiny, total;
+
+ tiny = total = 0;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ unsigned long size;
+
+ size = max(get_lru_size(lruvec, LRU_ACTIVE_FILE),
+ get_lru_size(lruvec, LRU_INACTIVE_FILE));
+ if (get_nr_swap_pages() > 0)
+ size = max3(size,
+ get_lru_size(lruvec, LRU_ACTIVE_ANON),
+ get_lru_size(lruvec, LRU_INACTIVE_ANON));
+
+ if (size && size >> DEF_PRIORITY == 0)
+ tiny++;
+ total++;
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ zone->force_scan = tiny * 100 > total * sysctl_force_scan_thresh;
+}
+#else
+static inline bool zone_force_scan(struct zone *zone)
+{
+ return false;
+}
+
+static inline void zone_update_force_scan(struct zone *zone)
+{
+}
+#endif
+
enum scan_balance {
SCAN_EQUAL,
SCAN_FRACT,
@@ -1917,6 +1962,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
force_scan = true;
if (!global_reclaim(sc))
force_scan = true;
+ if (zone_force_scan(zone))
+ force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
@@ -3031,6 +3078,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
if (!populated_zone(zone))
continue;
+ zone_update_force_scan(zone);
+
if (zone->all_unreclaimable &&
sc.priority != DEF_PRIORITY)
continue;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c4144bf..a42b71b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1060,10 +1060,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m,
"\n all_unreclaimable: %u"
"\n start_pfn: %lu"
- "\n inactive_ratio: %u",
+ "\n inactive_ratio: %u"
+ "\n force_scan: %d",
zone->all_unreclaimable,
zone->zone_start_pfn,
- zone->inactive_ratio);
+ zone->inactive_ratio,
+ zone->force_scan);
seq_putc(m, '\n');
}