[Devel] [PATCH RHEL7 COMMIT] ub: add heuristic check for memory overcommit

Konstantin Khorenko khorenko at virtuozzo.com
Fri May 13 08:13:56 PDT 2016


The commit is pushed to "branch-rh7-3.10.0-327.10.1.vz7.12.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.10.1.vz7.12.19
------>
commit d16b8e34160ea2ca0306640d1ac875136cbb01d3
Author: Vladimir Davydov <vdavydov at virtuozzo.com>
Date:   Fri May 13 19:13:56 2016 +0400

    ub: add heuristic check for memory overcommit
    
    Currently, we only have a hard limit on virtual address space size
    inside containers. This is inflexible, because setting it to a small
    value will cause many normal allocations to fail; that is why it is
    left unlimited by default in Vz7. OTOH, allowing an application to
    allocate as much virtual address space as it wants may be bad for
    applications that expect to be stopped gracefully by mmap returning
    ENOMEM instead of being killed by the OOM killer.
    
    So this patch introduces a "heuristic" mode of overcommit accounting
    inside containers, similar to the one used on most hosts by default
    (vm.overcommit_memory sysctl set to 0). It can be toggled system-wide
    by changing the value of the ubc.overcommit_memory sysctl.
    Per-beancounter configuration is not supported yet, but it may be
    added later if needed.
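    
    For illustration, the knob should appear as
    /proc/sys/ubc/overcommit_memory (this path is an assumption inferred
    from the ub_sysctl_table hunk below); a minimal userspace sketch for
    flipping the mode could look like this:
    
        /* Hypothetical example: disable the heuristic check system-wide.
         * The /proc/sys/ubc/overcommit_memory path is assumed, mirroring
         * the "ubc.overcommit_memory" name used above. */
        #include <stdio.h>
        
        int main(void)
        {
                FILE *f = fopen("/proc/sys/ubc/overcommit_memory", "w");
        
                if (!f) {
                        perror("fopen");
                        return 1;
                }
                fputs("1\n", f);  /* 1: check only the privvmpages limit */
                fclose(f);
                return 0;
        }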
    
    If enabled (ubc.overcommit_memory = 0, the default), an application
    inside a container will fail to allocate a virtual address range iff
    its length is greater than the amount of unused plus reclaimable
    memory available to the container. Note that the UB_PRIVVMPAGES limit
    is still taken into account. If disabled (ubc.overcommit_memory = 1),
    only the UB_PRIVVMPAGES limit is checked.
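    
    In short, the check added below boils down to the following condensed
    sketch (ub_enough_memory and mem_cgroup_enough_memory from this patch
    merged into one function for readability; it is not a separate API):
    
        /* Sketch of the heuristic: fail if the request exceeds unused
         * memory plus reclaimable dcache and page cache, not counting
         * shmem, which can only be swapped out, not purged. */
        static int container_enough_memory(struct user_beancounter *ub,
                                           struct mem_cgroup *memcg,
                                           long pages)
        {
                long free;
        
                /* the privvmpages limit always applies */
                if (ub->ub_parms[UB_PRIVVMPAGES].held >
                    ub->ub_parms[UB_PRIVVMPAGES].barrier)
                        return -ENOMEM;
        
                if (ub_overcommit_memory)       /* heuristic disabled */
                        return 0;
        
                /* unused memory within the memory+swap limit */
                free = (res_counter_read_u64(&memcg->memsw, RES_LIMIT) -
                        res_counter_read_u64(&memcg->memsw, RES_USAGE)) >> PAGE_SHIFT;
                /* reclaimable slabs and page cache, minus shmem */
                free += res_counter_read_u64(&memcg->dcache, RES_USAGE) >> PAGE_SHIFT;
                free += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
                free -= mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
        
                return free < pages ? -ENOMEM : 0;
        }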
    
    https://jira.sw.ru/browse/PSBM-45695
    
    Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
    Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 include/bc/vmpages.h    |  4 ++++
 kernel/bc/beancounter.c |  7 +++++++
 kernel/bc/vm_pages.c    | 28 ++++++++++++++++++++++++++++
 mm/memcontrol.c         | 21 +++++++++++++++++++++
 mm/mmap.c               |  9 +++------
 5 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/include/bc/vmpages.h b/include/bc/vmpages.h
index bf63b88..72a5d8e 100644
--- a/include/bc/vmpages.h
+++ b/include/bc/vmpages.h
@@ -17,6 +17,8 @@
 #include <bc/beancounter.h>
 #include <bc/decl.h>
 
+extern int ub_overcommit_memory;
+
 /*
  * Check whether vma has private or copy-on-write mapping.
  */
@@ -47,4 +49,6 @@ UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi,
 UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi,
 			unsigned long size))
 
+UB_DECLARE_FUNC(int, ub_enough_memory(struct mm_struct *mm, long pages))
+
 #endif /* __UB_PAGES_H_ */
diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c
index 5023bd2..18188f7 100644
--- a/kernel/bc/beancounter.c
+++ b/kernel/bc/beancounter.c
@@ -1135,6 +1135,13 @@ static ctl_table ub_sysctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_resource_precharge,
 	},
+	{
+		.procname	= "overcommit_memory",
+		.data		= &ub_overcommit_memory,
+		.maxlen		= sizeof(ub_overcommit_memory),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #ifdef CONFIG_BC_IO_ACCOUNTING
 	{
 		.procname	= "dirty_ratio",
diff --git a/kernel/bc/vm_pages.c b/kernel/bc/vm_pages.c
index 5e588d1..b04ea13 100644
--- a/kernel/bc/vm_pages.c
+++ b/kernel/bc/vm_pages.c
@@ -24,6 +24,8 @@
 #include <bc/vmpages.h>
 #include <bc/proc.h>
 
+int ub_overcommit_memory;
+
 int ub_memory_charge(struct mm_struct *mm, unsigned long size,
 		unsigned vm_flags, struct file *vm_file, int sv)
 {
@@ -119,6 +121,32 @@ void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size)
 	uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
 }
 
+extern int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages);
+
+int ub_enough_memory(struct mm_struct *mm, long pages)
+{
+	struct user_beancounter *ub;
+	struct cgroup_subsys_state *css;
+	int ret;
+
+	if (!mm)
+		return 0;
+
+	ub = mm->mm_ub;
+
+	if (ub->ub_parms[UB_PRIVVMPAGES].held >
+	    ub->ub_parms[UB_PRIVVMPAGES].barrier)
+		return -ENOMEM;
+
+	if (ub_overcommit_memory)
+		return 0;
+
+	css = ub_get_mem_css(ub);
+	ret = mem_cgroup_enough_memory(mem_cgroup_from_cont(css->cgroup), pages);
+	css_put(css);
+	return ret;
+}
+
 static int bc_fill_sysinfo(struct user_beancounter *ub,
 		unsigned long meminfo_val, struct sysinfo *si)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f52cd8e..b577055 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4976,6 +4976,27 @@ void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
 	mi->shmem = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
 }
 
+int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
+{
+	long free;
+
+	/* unused memory */
+	free = (res_counter_read_u64(&memcg->memsw, RES_LIMIT) -
+		res_counter_read_u64(&memcg->memsw, RES_USAGE)) >> PAGE_SHIFT;
+
+	/* reclaimable slabs */
+	free += res_counter_read_u64(&memcg->dcache, RES_USAGE) >> PAGE_SHIFT;
+
+	/* assume file cache is reclaimable */
+	free += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+
+	/* but do not count shmem pages as they can't be purged,
+	 * only swapped out */
+	free -= mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
+
+	return free < pages ? -ENOMEM : 0;
+}
+
 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
 	u64 val;
diff --git a/mm/mmap.c b/mm/mmap.c
index 417163e..fcd1ea3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -135,13 +135,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
 	unsigned long free, allowed, reserve;
 
-	vm_acct_memory(pages);
+	if (mm && ub_enough_memory(mm, pages) != 0)
+		return -ENOMEM;
 
-#ifdef CONFIG_BEANCOUNTERS
-	if (mm && mm->mm_ub->ub_parms[UB_PRIVVMPAGES].held <=
-			mm->mm_ub->ub_parms[UB_VMGUARPAGES].barrier)
-		return 0;
-#endif
+	vm_acct_memory(pages);
 
 	/*
 	 * Sometimes we want to use more memory than we have

