[Devel] [PATCH RHEL8 COMMIT] ve/mm: add heuristic check for memory overcommit

Konstantin Khorenko khorenko at virtuozzo.com
Mon Jun 7 18:55:25 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.35
------>
commit 52e1ca1601e9a9391d4bba00914a4a4d2e945f22
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Mon Jun 7 18:55:25 2021 +0300

    ve/mm: add heuristic check for memory overcommit
    
    Previously we only had a hard limit on the virtual address space size
    inside containers. This was inflexible, because setting it to a small
    value causes many normal allocations to fail; that is why it is left
    unlimited by default in Vz7. On the other hand, allowing an
    application to allocate as much virtual address space as it wants may
    be bad for applications that expect to be stopped gracefully by
    mmap() returning ENOMEM instead of being killed by the OOM killer.
    
    So this patch introduces a "heuristic" mode of overcommit accounting
    inside containers, similar to the mode used on most hosts by default
    (the vm.overcommit_memory sysctl set to 0). It can be toggled
    system-wide via the vm.ve_overcommit_memory sysctl. Per-ve
    configuration is not supported yet, but it may be added later if
    needed.
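    
    For example, the check could be disabled system-wide like this
    (hypothetical admin session; per the sysctl table below, the value is
    clamped to 0..1 and 0 keeps the check enabled):
    
        # cat /proc/sys/vm/ve_overcommit_memory
        0
        # sysctl -w vm.ve_overcommit_memory=1
        vm.ve_overcommit_memory = 1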
    
    When enabled (vm.ve_overcommit_memory = 0, which is the default), an
    application inside a container will fail to allocate a virtual
    address range if its length exceeds the amount of unused plus
    reclaimable memory accounted to the container.
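    
    For illustration only (not part of the patch), a minimal userspace
    sketch of what such an application would see inside a container when
    the check triggers; the 1 TiB request is an arbitrary size assumed to
    exceed the container's limit:
    
        #include <stdio.h>
        #include <string.h>
        #include <errno.h>
        #include <sys/mman.h>
        
        int main(void)
        {
                /*
                 * Ask for far more anonymous, private, writable address
                 * space than the container can back.  Such a mapping is
                 * charged via __vm_enough_memory(), so with the heuristic
                 * check enabled it fails with ENOMEM up front instead of
                 * the task being OOM-killed later.
                 */
                size_t len = 1UL << 40; /* 1 TiB */
                void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        
                if (p == MAP_FAILED)
                        printf("mmap: %s\n", strerror(errno));
                else
                        printf("mmap of %zu bytes succeeded\n", len);
                return 0;
        }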
    
    https://jira.sw.ru/browse/PSBM-45695
    https://jira.sw.ru/browse/PSBM-48891
    
    Rebase to vz8:
    
    There are no beancounters now, so:
    - Take the amount of reclaimable memory used for the allocation check
      from the current ve's memcg (a worked example follows below);
    - Increment the failcnt of the ve-root memory cgroup's memsw counter,
      as we use memsw.max as the base limit;
    - Rename ubc.overcommit_memory to vm.ve_overcommit_memory.
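    
    As a worked example with hypothetical numbers: if memsw.max is 262144
    pages, 200000 pages are in use, and the memcg has 5000 reclaimable
    slab pages plus 30000 page-cache pages, the check allows a single
    allocation of up to 262144 - 200000 + 5000 + 30000 = 97144 pages;
    a larger request fails with -ENOMEM and increments memsw.failcnt.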
    
    https://jira.sw.ru/browse/PSBM-129225
    (cherry-picked from vz7 commit 69ab7b978c6 ("ub: add heuristic check for
    memory overcommit"))
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 include/linux/mm.h |  3 +++
 kernel/sysctl.c    | 11 +++++++++++
 mm/memcontrol.c    | 40 ++++++++++++++++++++++++++++++++++++++++
 mm/util.c          |  3 +++
 4 files changed, 57 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ae180a3f4a9..c601be7bbd62 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -149,6 +149,9 @@ extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern unsigned long sysctl_overcommit_kbytes;
 
+extern int sysctl_ve_overcommit_memory;
+extern int ve_enough_memory(long pages);
+
 extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *,
 				    size_t *, loff_t *);
 extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 98b230fe580c..228dfb6d7609 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1317,6 +1317,17 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
+#ifdef CONFIG_VE
+	{
+		.procname	= "ve_overcommit_memory",
+		.data		= &sysctl_ve_overcommit_memory,
+		.maxlen		= sizeof(sysctl_ve_overcommit_memory),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 	{
 		.procname	= "panic_on_oom",
 		.data		= &sysctl_panic_on_oom,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 43efd7998c81..47d8069a026f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -70,6 +70,7 @@
 #include <linux/seq_buf.h>
 #include <linux/virtinfo.h>
 #include <linux/migrate.h>
+#include <linux/ve.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -3868,6 +3869,45 @@ void mem_cgroup_fill_vmstat(struct mem_cgroup *memcg, unsigned long *stats)
 #endif
 }
 
+int sysctl_ve_overcommit_memory __read_mostly;
+
+static int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
+{
+	long free;
+
+	/* unused memory */
+	free = memcg->memsw.max - page_counter_read(&memcg->memory);
+
+	/* reclaimable slabs */
+	free += memcg_page_state(memcg, NR_SLAB_RECLAIMABLE);
+
+	/* assume file cache is reclaimable */
+	free += memcg_page_state(memcg, MEMCG_CACHE);
+
+	return free < pages ? -ENOMEM : 0;
+}
+
+int ve_enough_memory(long pages)
+{
+	struct ve_struct *ve = get_exec_env();
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+	int ret;
+
+	if (ve_is_super(ve) || sysctl_ve_overcommit_memory)
+		return 0;
+
+	css = ve_get_init_css(ve, memory_cgrp_id);
+	memcg = mem_cgroup_from_css(css);
+	ret = mem_cgroup_enough_memory(memcg, pages);
+
+	if (unlikely(ret < 0))
+		memcg->memsw.failcnt++;
+
+	css_put(css);
+	return ret;
+}
+
 static int memcg_numa_stat_show(struct seq_file *m, void *v)
 {
 	struct numa_stat {
diff --git a/mm/util.c b/mm/util.c
index b8b568fce0f9..fc3d40eb1fc0 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -723,6 +723,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 			-(s64)vm_committed_as_batch * num_online_cpus(),
 			"memory commitment underflow");
 
+	if (ve_enough_memory(pages))
+		return -ENOMEM;
+
 	vm_acct_memory(pages);
 
 	/*

