[Devel] [PATCH rh7 1/2] ms/mm: scale kswapd watermarks in proportion to memory
Andrey Ryabinin
aryabinin at virtuozzo.com
Fri May 17 16:52:06 MSK 2019
From: Johannes Weiner <hannes at cmpxchg.org>
In machines with 140G of memory and enterprise flash storage, we have
seen read and write bursts routinely exceed the kswapd watermarks and
cause thundering herds in direct reclaim. Unfortunately, the only way
to tune kswapd aggressiveness is through adjusting min_free_kbytes - the
system's emergency reserves - which is entirely unrelated to the
system's latency requirements. In order to get kswapd to maintain a
250M buffer of free memory, the emergency reserves need to be set to 1G.
That is a lot of memory wasted for no good reason.
On the other hand, it's reasonable to assume that allocation bursts and
overall allocation concurrency scale with memory capacity, so it makes
sense to make kswapd aggressiveness a function of that as well.
Change the kswapd watermark scale factor from the currently fixed 25% of
the tunable emergency reserve to a tunable 0.1% of memory.
Beyond 1G of memory, this will produce bigger watermark steps than the
current formula in default settings. Ensure that the new formula never
chooses steps smaller than that, i.e. 25% of the emergency reserve.
On a 140G machine, this raises the default watermark steps - the
distance between min and low, and low and high - from 16M to 143M.
Signed-off-by: Johannes Weiner <hannes at cmpxchg.org>
Acked-by: Mel Gorman <mgorman at suse.de>
Acked-by: Rik van Riel <riel at redhat.com>
Acked-by: David Rientjes <rientjes at google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim at lge.com>
Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
https://jira.sw.ru/browse/PSBM-94102
(cherry picked from commit 795ae7a0de6b834a0cc202aa55c190ef81496665)
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
Documentation/sysctl/vm.txt | 18 ++++++++++++++++++
include/linux/mm.h | 1 +
include/linux/mmzone.h | 2 ++
kernel/sysctl.c | 10 ++++++++++
mm/page_alloc.c | 29 +++++++++++++++++++++++++++--
5 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index bd869389291e..0708463bf737 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -782,6 +782,24 @@ never reclaim dentries and inodes due to memory pressure and this can easily
lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
causes the kernel to prefer to reclaim dentries and inodes.
+=============================================================
+
+watermark_scale_factor:
+
+This factor controls the aggressiveness of kswapd. It defines the
+amount of memory left in a node/system before kswapd is woken up and
+how much memory needs to be free before kswapd goes back to sleep.
+
+The unit is in fractions of 10,000. The default value of 10 means the
+distances between watermarks are 0.1% of the available memory in the
+node/system. The maximum value is 1000, or 10% of memory.
+
+A high rate of threads entering direct reclaim (allocstall) or kswapd
+going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
+that the number of free pages kswapd maintains for latency reasons is
+too small for the allocation bursts occurring in the system. This knob
+can then be used to tune kswapd aggressiveness accordingly.
+
==============================================================
zone_reclaim_mode:
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6bd1cef6e840..83a180536a16 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2002,6 +2002,7 @@ extern void zone_pcp_reset(struct zone *zone);
/* page_alloc.c */
extern int min_free_kbytes;
+extern int watermark_scale_factor;
/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 70e925d41445..b0df91a6aceb 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1015,6 +1015,8 @@ static inline int is_dma(struct zone *zone)
struct ctl_table;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
+ void __user *, size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 85e9d5e9be83..272b05541176 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -130,6 +130,7 @@ static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
static int one_hundred = 100;
+static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -1488,6 +1489,15 @@ static struct ctl_table vm_table[] = {
.proc_handler = min_free_kbytes_sysctl_handler,
.extra1 = &zero,
},
+ {
+ .procname = "watermark_scale_factor",
+ .data = &watermark_scale_factor,
+ .maxlen = sizeof(watermark_scale_factor),
+ .mode = 0644,
+ .proc_handler = watermark_scale_factor_sysctl_handler,
+ .extra1 = &one,
+ .extra2 = &one_thousand,
+ },
{
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 45da030d8da2..50b761c9ce10 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -234,6 +234,7 @@ static char * const zone_names[REAL_MAX_ZONES] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes;
+int watermark_scale_factor = 10;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -6788,8 +6789,17 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_MIN] = tmp;
}
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+ /*
+ * Set the kswapd watermarks distance according to the
+ * scale factor in proportion to available memory, but
+ * ensure a minimum size on small systems.
+ */
+ tmp = max_t(u64, tmp >> 2,
+ mult_frac(zone->managed_pages,
+ watermark_scale_factor, 10000));
+
+ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
+ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) -
@@ -6867,6 +6877,21 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
return 0;
}
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write)
+ setup_per_zone_wmarks();
+
+ return 0;
+}
+
#ifdef CONFIG_NUMA
int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
--
2.21.0
More information about the Devel
mailing list