[Devel] [PATCH vz9 2/5] ve/mm: add heuristic check for memory overcommit

Nikita Yushchenko nikita.yushchenko at virtuozzo.com
Wed Sep 29 21:25:21 MSK 2021


From: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>

Previously we only had a hard limit on virtual address space size
inside containers. This was inflexible, because setting it to a small
value makes many normal allocations fail; that is why it is left
unlimited by default in Vz7. On the other hand, allowing an
application to allocate as much virtual address space as it wants may
be bad for applications that expect to be stopped gracefully by mmap()
returning ENOMEM instead of being killed by the OOM killer.

So this patch introduces a "heuristic" mode of overcommit accounting
inside containers, similar to the one used on most hosts by default
(vm.overcommit_memory sysctl set to 0). It can be toggled system-wide
via the vm.ve_overcommit_memory sysctl: 0 selects the heuristic check,
1 disables the check and allows unlimited overcommit. Per-ve
configuration is not supported yet, but it may be added later if
needed.
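
The knob lives under /proc/sys/vm/ since the patch adds it to
vm_table. A minimal sketch of flipping it from the host in C
(equivalent to "sysctl -w vm.ve_overcommit_memory=1"):

#include <stdio.h>

int main(void)
{
	/* Write 1 to the sysctl added by this patch, disabling the
	 * per-container heuristic check system-wide. */
	FILE *f = fopen("/proc/sys/vm/ve_overcommit_memory", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}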

When enabled (vm.ve_overcommit_memory = 0, which is the default), an
application inside a container will fail to allocate a virtual address
range if its length is greater than the amount of free and reclaimable
memory accounted to the container, i.e. the unused room below
memsw.max plus reclaimable slabs plus file cache.
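
For illustration, a minimal userspace sketch of the expected failure
mode; the 64 GiB length is an arbitrary value assumed to exceed the
container's free and reclaimable memory:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* A private writable anonymous mapping is accountable, so it
	 * goes through __vm_enough_memory() and thus through the new
	 * ve_enough_memory() check. */
	size_t len = 64UL << 30;	/* 64 GiB */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");	/* expected: Cannot allocate memory */
		return 1;
	}
	munmap(p, len);
	return 0;
}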

https://jira.sw.ru/browse/PSBM-45695
https://jira.sw.ru/browse/PSBM-48891

Rebase to vz8:

There are no beancounters now, so:
- Take the amount of reclaimable memory used for the allocation checks
  from the current ve's memcg;
- Increment the failcnt of the ve-root memory cgroup's memsw counter,
  since we use memsw.max as the base limit;
- Rename ubc.overcommit_memory to vm.ve_overcommit_memory.

https://jira.sw.ru/browse/PSBM-129225
(cherry-picked from vz7 commit 69ab7b978c6 ("ub: add heuristic check for
memory overcommit"))
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>

+++
ve/mm: Honor changing per-memcg slab [un]reclaimable counters to bytes in heuristic check for memory overcommit

RHEL8.4 has the following ms commit backported:
d42f3245c7e2 ("mm: memcg: convert vmstat slab counters to bytes")

So, update the places where we use the per-memcg
NR_SLAB_[UN]RECLAIMABLE_B counters accordingly: they are now kept in
bytes and have to be shifted right by PAGE_SHIFT before being combined
with page-based counters.
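
A hypothetical worked example of that conversion (standalone userspace
arithmetic, 4 KiB pages assumed):

#include <stdio.h>

int main(void)
{
	/* Hypothetical figure: 8 MiB of reclaimable slab. Since
	 * d42f3245c7e2 the NR_SLAB_[UN]RECLAIMABLE_B counters are kept
	 * in bytes, so shift down by PAGE_SHIFT before mixing with
	 * page-based counters such as NR_FILE_PAGES. */
	unsigned long slab_bytes = 8UL << 20;
	unsigned int page_shift = 12;	/* 4 KiB pages */

	printf("%lu pages\n", slab_bytes >> page_shift);	/* 2048 */
	return 0;
}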

https://jira.sw.ru/browse/PSBM-132893

Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>

(cherry-picked from vz8 commit 6627a6313e4e ("ve/mm: add heuristic check
for memory overcommit"))

Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 include/linux/mm.h |  4 +++-
 kernel/sysctl.c    | 11 +++++++++++
 mm/memcontrol.c    | 40 ++++++++++++++++++++++++++++++++++++++++
 mm/util.c          |  3 +++
 4 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 41e81911cd9a..84112ca21dc8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -211,6 +211,9 @@ extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern unsigned long sysctl_overcommit_kbytes;
 
+extern int sysctl_ve_overcommit_memory;
+extern int ve_enough_memory(long pages);
+
 int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
 int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
@@ -224,7 +227,6 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
  */
 int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		pgoff_t index, gfp_t gfp, void **shadowp);
-
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
 #else
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 563e44b26634..2f144293e6c5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2770,6 +2770,17 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
+#ifdef CONFIG_VE
+	{
+		.procname       = "ve_overcommit_memory",
+		.data           = &sysctl_ve_overcommit_memory,
+		.maxlen         = sizeof(sysctl_ve_overcommit_memory),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
+	},
+#endif
 	{
 		.procname	= "panic_on_oom",
 		.data		= &sysctl_panic_on_oom,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 41c3d5f25fba..a5b893a4721f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -65,6 +65,7 @@
 #include <linux/seq_buf.h>
 #include <linux/virtinfo.h>
 #include <linux/migrate.h>
+#include <linux/ve.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -4091,6 +4092,45 @@ void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
 	/* mi->locked = 0; */
 }
 
+int sysctl_ve_overcommit_memory __read_mostly;
+
+static int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
+{
+	long free;
+
+	/* unused memory */
+	free = memcg->memsw.max - page_counter_read(&memcg->memory);
+
+	/* reclaimable slabs */
+	free += memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
+
+	/* assume file cache is reclaimable */
+	free += memcg_page_state(memcg, NR_FILE_PAGES);
+
+	return free < pages ? -ENOMEM : 0;
+}
+
+int ve_enough_memory(long pages)
+{
+	struct ve_struct *ve = get_exec_env();
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+	int ret;
+
+	if (ve_is_super(ve) || sysctl_ve_overcommit_memory)
+		return 0;
+
+	css = ve_get_init_css(ve, memory_cgrp_id);
+	memcg = mem_cgroup_from_css(css);
+	ret = mem_cgroup_enough_memory(memcg, pages);
+
+	if (unlikely(ret < 0))
+		memcg->memsw.failcnt++;
+
+	css_put(css);
+	return ret;
+}
+
 static int memcg_numa_stat_show(struct seq_file *m, void *v)
 {
 	struct numa_stat {
diff --git a/mm/util.c b/mm/util.c
index 9043d03750a7..7267ca1b54bf 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -873,6 +873,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
 	long allowed;
 
+	if (ve_enough_memory(pages))
+		return -ENOMEM;
+
 	vm_acct_memory(pages);
 
 	/*
-- 
2.30.2


