[Devel] [PATCH RH8] mm/ve: add heuristic check for memory overcommit

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Tue Jun 1 16:42:38 MSK 2021


Previously we only had the hard limit for virtual address space size
inside containers. This was inflexible, because setting it to a small
value will cause many normal allocations to fail. BTW that's why it is
left unlimited by default in Vz7. OTOH allowing an application to
allocate as much virtual address space as it wants may be bad for some
applications that expect to be stopped gracefully by mmap returning
ENOMEM instead of being killed by OOM.

So this patch introduces the "heuristic" mode of overcommit accounting
inside containers similar to the one used on most hosts by default
(vm.overcommit_memory sysctl set to 0). It can be toggled system-wide by
changing the value of vm.ve_overcommit_memory sysctl. Per-ve
configuration is not supported yet, but it may be added later if needed.

When enabled (vm.ve_overcommit_memory = 0, this is the default), an
application inside a container will fail to allocate a virtual address
range if its length is greater than the amount of reclaimable memory
accounted to the container.

https://jira.sw.ru/browse/PSBM-45695
https://jira.sw.ru/browse/PSBM-48891

Rebase to vz8:

There are no beancounters now, so:
- Take amount of reclaimable memory used for allocation checks from
  current ve's memcg;
- Let's increment failcount for ve-root memory cgroup (memsw, as we use
  memsw.max as a base limit);
- Rename ubc.overcommit_memory to vm.ve_overcommit_memory;

https://jira.sw.ru/browse/PSBM-129225
(cherry-picked from vz7 commit 69ab7b978c6 ("ub: add heuristic check for
memory overcommit"))
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 include/linux/mm.h |  3 +++
 kernel/sysctl.c    | 11 +++++++++++
 mm/memcontrol.c    | 40 ++++++++++++++++++++++++++++++++++++++++
 mm/util.c          |  3 +++
 4 files changed, 57 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 807ecde29a42..1c5dedc78142 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -149,6 +149,9 @@ extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern unsigned long sysctl_overcommit_kbytes;
 
+extern int sysctl_ve_overcommit_memory;
+extern int ve_enough_memory(long pages);
+
 extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *,
 				    size_t *, loff_t *);
 extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 346aa585dcf6..ddb998783f0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1324,6 +1324,17 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
+#ifdef CONFIG_VE
+	{
+		.procname	= "ve_overcommit_memory",
+		.data		= &sysctl_ve_overcommit_memory,
+		.maxlen		= sizeof(sysctl_ve_overcommit_memory),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 	{
 		.procname	= "panic_on_oom",
 		.data		= &sysctl_panic_on_oom,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 16c58d4e8660..597e1b863e01 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -70,6 +70,7 @@
 #include <linux/seq_buf.h>
 #include <linux/virtinfo.h>
 #include <linux/migrate.h>
+#include <linux/ve.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -3866,6 +3867,45 @@ void mem_cgroup_fill_vmstat(struct mem_cgroup *memcg, unsigned long *stats)
 #endif
 }
 
+int sysctl_ve_overcommit_memory __read_mostly;
+
+static int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
+{
+	long free;
+
+	/* unused memory */
+	free = memcg->memsw.max - page_counter_read(&memcg->memory);
+
+	/* reclaimable slabs */
+	free += memcg_page_state(memcg, NR_SLAB_RECLAIMABLE);
+
+	/* assume file cache is reclaimable */
+	free += memcg_page_state(memcg, MEMCG_CACHE);
+
+	return free < pages ? -ENOMEM : 0;
+}
+
+int ve_enough_memory(long pages)
+{
+	struct ve_struct *ve = get_exec_env();
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+	int ret;
+
+	if (ve_is_super(ve) || sysctl_ve_overcommit_memory)
+		return 0;
+
+	css = ve_get_init_css(ve, memory_cgrp_id);
+	memcg = mem_cgroup_from_css(css);
+	ret = mem_cgroup_enough_memory(memcg, pages);
+
+	if (unlikely(ret < 0))
+		memcg->memsw.failcnt++;
+
+	css_put(css);
+	return ret;
+}
+
 static int memcg_numa_stat_show(struct seq_file *m, void *v)
 {
 	struct numa_stat {
diff --git a/mm/util.c b/mm/util.c
index e99de9d3c8ae..6c7a0841c0b4 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -709,6 +709,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 			-(s64)vm_committed_as_batch * num_online_cpus(),
 			"memory commitment underflow");
 
+	if (ve_enough_memory(pages))
+		return -ENOMEM;
+
 	vm_acct_memory(pages);
 
 	/*
-- 
2.31.1



More information about the Devel mailing list