[Devel] [PATCH RHEL COMMIT] ve/mm: add heuristic check for memory overcommit

Thu Sep 30 18:08:38 MSK 2021

The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit 8435107913b174e545cb779949e935c1ad5d2028
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Thu Sep 30 18:08:38 2021 +0300

    ve/mm: add heuristic check for memory overcommit
    
    Previously we only had the hard limit for virtual address space size
    inside containers. This was inflexible, because setting it to a small
    value will cause many normal allocations to fail. BTW that's why it is
    left unlimited by default in Vz7. OTOH allowing an application to
    allocate as much virtual address space as it wants may be bad, because
    some applications expect to be stopped gracefully by mmap returning
    ENOMEM instead of being killed by OOM.
    
    So this patch introduces the "heuristic" mode of overcommit accounting
    inside containers similar to the one used on most hosts by default
    (vm.overcommit_memory sysctl set to 0). It can be toggled system-wide by
    changing the value of vm.ve_overcommit_memory sysctl. Per-ve
    configuration is not supported yet, but it may be added later if needed.
    
    When enabled (vm.ve_overcommit_memory = 0, this is the default), an
    application inside a container will fail to allocate a virtual address
    range if its length is greater than the amount of reclaimable memory
    accounted to the container.
    
    https://jira.sw.ru/browse/PSBM-45695
    https://jira.sw.ru/browse/PSBM-48891
    
    Rebase to vz8:
    
    There are no beancounters now, so:
    - Take amount of reclaimable memory used for allocation checks from
      current ve's memcg;
    - Let's increment failcount for ve-root memory cgroup (memsw, as we use
      memsw.max as a base limit);
    - Rename ubc.overcommit_memory to vm.ve_overcommit_memory;
    
    https://jira.sw.ru/browse/PSBM-129225
    (cherry-picked from vz7 commit 69ab7b978c6 ("ub: add heuristic check for
    memory overcommit"))
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    +++
    ve/mm: Honor changing per-memcg s[un]reclaimable counters to bytes in heuristic check for memory overcommit
    
    RHEL8.4 has following ms commit backported:
    d42f3245c7e2 ("mm: memcg: convert vmstat slab counters to bytes")
    
    So, update places where we use per-memcg counters NR_SLAB_[UN]RECLAIMABLE_B
    accordingly.
    
    https://jira.sw.ru/browse/PSBM-132893
    
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    (cherry-picked from vz8 commit 6627a6313e4e ("ve/mm: add heuristic check
    for memory overcommit"))
    
    Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 include/linux/mm.h |  4 +++-
 kernel/sysctl.c    | 11 +++++++++++
 mm/memcontrol.c    | 40 ++++++++++++++++++++++++++++++++++++++++
 mm/util.c          |  3 +++
 4 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 63380ba4ecaa..e59e27f4a528 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -211,6 +211,9 @@ extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern unsigned long sysctl_overcommit_kbytes;
 
+extern int sysctl_ve_overcommit_memory;
+extern int ve_enough_memory(long pages);
+
 int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
 int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
@@ -224,7 +227,6 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
  */
 int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		pgoff_t index, gfp_t gfp, void **shadowp);
-
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
 #else
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abb6df3b1d0..2f9ee645c63b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2774,6 +2774,17 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &two,
 	},
+#ifdef CONFIG_VE
+	{
+		.procname       = "ve_overcommit_memory",
+		.data           = &sysctl_ve_overcommit_memory,
+		.maxlen         = sizeof(sysctl_ve_overcommit_memory),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = &zero,
+		.extra2         = &one,
+	},
+#endif
 	{
 		.procname	= "panic_on_oom",
 		.data		= &sysctl_panic_on_oom,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 41c3d5f25fba..a5b893a4721f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -65,6 +65,7 @@
 #include <linux/seq_buf.h>
 #include <linux/virtinfo.h>
 #include <linux/migrate.h>
+#include <linux/ve.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -4091,6 +4092,45 @@ void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
 	/* mi->locked = 0; */
 }
 
+int sysctl_ve_overcommit_memory __read_mostly;
+
+static int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
+{
+	long free;	/* running estimate, in pages, compared against @pages below */
+
+	/* unused memory: memsw.max less the memory counter (NOTE(review): not memsw - confirm) */
+	free = memcg->memsw.max - page_counter_read(&memcg->memory);
+
+	/* reclaimable slabs; the _B counter is kept in bytes, so convert to pages */
+	free += memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
+
+	/* assume file cache is reclaimable, so count all file pages as available */
+	free += memcg_page_state(memcg, NR_FILE_PAGES);
+
+	return free < pages ? -ENOMEM : 0;	/* -ENOMEM only when the request exceeds the estimate */
+}
+
+int ve_enough_memory(long pages)
+{
+	struct ve_struct *ve = get_exec_env();
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+	int ret;	/* 0 when the request is allowed, -ENOMEM when refused */
+
+	if (ve_is_super(ve) || sysctl_ve_overcommit_memory)
+		return 0;	/* no check for the super ve or when the sysctl is nonzero */
+
+	css = ve_get_init_css(ve, memory_cgrp_id);	/* memory css of this ve; put below */
+	memcg = mem_cgroup_from_css(css);
+	ret = mem_cgroup_enough_memory(memcg, pages);
+
+	if (unlikely(ret < 0))
+		memcg->memsw.failcnt++;	/* record the refusal in the memsw fail counter */
+
+	css_put(css);
+	return ret;
+}
+
 static int memcg_numa_stat_show(struct seq_file *m, void *v)
 {
 	struct numa_stat {
diff --git a/mm/util.c b/mm/util.c
index 90f35714ec8a..d188dd74de61 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -888,6 +888,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
 	long allowed;
 
+	if (ve_enough_memory(pages))
+		return -ENOMEM;
+
 	vm_acct_memory(pages);
 
 	/*