[Devel] [PATCH RHEL7 COMMIT] ub: add heuristic check for memory overcommit
Konstantin Khorenko
khorenko at virtuozzo.com
Tue May 17 07:49:43 PDT 2016
The commit is pushed to "branch-rh7-3.10.0-327.18.2.vz7.14.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.18.2.vz7.14.3
------>
commit ac88fd4037223c715e051920777e3332dace05c3
Author: Vladimir Davydov <vdavydov at virtuozzo.com>
Date: Fri May 13 19:13:56 2016 +0400
ub: add heuristic check for memory overcommit
Currently, we only have a hard limit on virtual address space size
inside containers. This is inflexible, because setting it to a small
value will cause many normal allocations to fail, which is why it is
left unlimited by default in Vz7. On the other hand, allowing an
application to allocate as much virtual address space as it wants may
be bad for applications that expect to be stopped gracefully by mmap
returning ENOMEM instead of being killed by the OOM killer.

So this patch introduces a "heuristic" mode of overcommit accounting
inside containers, similar to the one used on most hosts by default
(the vm.overcommit_memory sysctl set to 0). It can be toggled
system-wide by changing the value of the ubc.overcommit_memory sysctl;
see the usage sketch below. Per-beancounter configuration is not
supported yet, but it may be added later if needed.
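
As a usage illustration (not part of the patch): the mode can be
flipped system-wide from userspace. A minimal C sketch, assuming the
sysctl is exposed at /proc/sys/ubc/overcommit_memory, the conventional
/proc path for a ubc.* sysctl:

#include <stdio.h>

int main(void)
{
        /* writing 1 disables the heuristic: only the UBC_PRIVVMPAGES
         * limit is checked afterwards */
        FILE *f = fopen("/proc/sys/ubc/overcommit_memory", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fputs("1\n", f);
        fclose(f);
        return 0;
}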

If enabled (ubc.overcommit_memory = 0, the default), an application
inside a container will fail to allocate a virtual address range if
and only if its length is greater than the amount of free plus
reclaimable memory accounted to the container. Note that the
UBC_PRIVVMPAGES limit is still taken into account. If disabled
(ubc.overcommit_memory = 1), only the UBC_PRIVVMPAGES limit is
checked.
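
To illustrate the intended behavior (again, not part of the patch): in
heuristic mode an oversized anonymous mapping fails immediately with
ENOMEM, letting the application back off instead of being OOM-killed
later. A sketch, where the 64 GiB request is an arbitrary value assumed
to exceed the container's free plus reclaimable memory:

#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 64UL << 30; /* assumed larger than free + reclaimable */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                if (errno == ENOMEM) {
                        /* degrade gracefully instead of an OOM kill */
                        fprintf(stderr, "mmap: out of memory, scaling down\n");
                        return 0;
                }
                perror("mmap");
                return 1;
        }
        munmap(p, len);
        return 0;
}
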
https://jira.sw.ru/browse/PSBM-45695
Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 include/bc/vmpages.h    |  4 ++++
 kernel/bc/beancounter.c |  7 +++++++
 kernel/bc/vm_pages.c    | 28 ++++++++++++++++++++++++++++
 mm/memcontrol.c         | 21 +++++++++++++++++++++
 mm/mmap.c               |  9 +++------
 5 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/include/bc/vmpages.h b/include/bc/vmpages.h
index bf63b88..72a5d8e 100644
--- a/include/bc/vmpages.h
+++ b/include/bc/vmpages.h
@@ -17,6 +17,8 @@
 #include <bc/beancounter.h>
 #include <bc/decl.h>
 
+extern int ub_overcommit_memory;
+
 /*
  * Check whether vma has private or copy-on-write mapping.
  */
@@ -47,4 +49,6 @@ UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi,
 UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi,
                 unsigned long size))
 
+UB_DECLARE_FUNC(int, ub_enough_memory(struct mm_struct *mm, long pages))
+
 #endif /* __UB_PAGES_H_ */
diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c
index 5023bd2..18188f7 100644
--- a/kernel/bc/beancounter.c
+++ b/kernel/bc/beancounter.c
@@ -1135,6 +1135,13 @@ static ctl_table ub_sysctl_table[] = {
                 .mode           = 0644,
                 .proc_handler   = &proc_resource_precharge,
         },
+        {
+                .procname       = "overcommit_memory",
+                .data           = &ub_overcommit_memory,
+                .maxlen         = sizeof(ub_overcommit_memory),
+                .mode           = 0644,
+                .proc_handler   = proc_dointvec,
+        },
 #ifdef CONFIG_BC_IO_ACCOUNTING
         {
                 .procname       = "dirty_ratio",
diff --git a/kernel/bc/vm_pages.c b/kernel/bc/vm_pages.c
index 5e588d1..b04ea13 100644
--- a/kernel/bc/vm_pages.c
+++ b/kernel/bc/vm_pages.c
@@ -24,6 +24,8 @@
 #include <bc/vmpages.h>
 #include <bc/proc.h>
 
+int ub_overcommit_memory;
+
 int ub_memory_charge(struct mm_struct *mm, unsigned long size,
                 unsigned vm_flags, struct file *vm_file, int sv)
 {
@@ -119,6 +121,32 @@ void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size)
         uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
 }
 
+extern int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages);
+
+int ub_enough_memory(struct mm_struct *mm, long pages)
+{
+        struct user_beancounter *ub;
+        struct cgroup_subsys_state *css;
+        int ret;
+
+        if (!mm)
+                return 0;
+
+        ub = mm->mm_ub;
+
+        if (ub->ub_parms[UB_PRIVVMPAGES].held >
+            ub->ub_parms[UB_PRIVVMPAGES].barrier)
+                return -ENOMEM;
+
+        if (ub_overcommit_memory)
+                return 0;
+
+        css = ub_get_mem_css(ub);
+        ret = mem_cgroup_enough_memory(mem_cgroup_from_cont(css->cgroup), pages);
+        css_put(css);
+        return ret;
+}
+
 static int bc_fill_sysinfo(struct user_beancounter *ub,
                 unsigned long meminfo_val, struct sysinfo *si)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f52cd8e..b577055 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4976,6 +4976,27 @@ void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
         mi->shmem = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
 }
 
+int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
+{
+        long free;
+
+        /* unused memory */
+        free = (res_counter_read_u64(&memcg->memsw, RES_LIMIT) -
+                res_counter_read_u64(&memcg->memsw, RES_USAGE)) >> PAGE_SHIFT;
+
+        /* reclaimable slabs */
+        free += res_counter_read_u64(&memcg->dcache, RES_USAGE) >> PAGE_SHIFT;
+
+        /* assume file cache is reclaimable */
+        free += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+
+        /* but do not count shmem pages as they can't be purged,
+         * only swapped out */
+        free -= mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
+
+        return free < pages ? -ENOMEM : 0;
+}
+
 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
         u64 val;
diff --git a/mm/mmap.c b/mm/mmap.c
index 417163e..fcd1ea3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -135,13 +135,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
         unsigned long free, allowed, reserve;
 
-        vm_acct_memory(pages);
+        if (mm && ub_enough_memory(mm, pages) != 0)
+                return -ENOMEM;
 
-#ifdef CONFIG_BEANCOUNTERS
-        if (mm && mm->mm_ub->ub_parms[UB_PRIVVMPAGES].held <=
-                        mm->mm_ub->ub_parms[UB_VMGUARPAGES].barrier)
-                return 0;
-#endif
+        vm_acct_memory(pages);
 
         /*
          * Sometimes we want to use more memory than we have
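
For reference, the check added in mem_cgroup_enough_memory() boils down
to simple page arithmetic. A standalone model of it, with a
hypothetical memcg_stats struct standing in for the res_counter and
recursive-stat reads (all values in pages):

#include <errno.h>
#include <stdio.h>

struct memcg_stats {
        long limit;     /* memsw limit */
        long usage;     /* memsw usage */
        long dcache;    /* reclaimable slab (dentries/inodes) */
        long cache;     /* file cache, assumed reclaimable */
        long shmem;     /* shmem: swappable, but not purgeable */
};

static int enough_memory(const struct memcg_stats *s, long pages)
{
        long free = s->limit - s->usage;        /* unused memory */

        free += s->dcache;      /* reclaimable slabs */
        free += s->cache;       /* file cache */
        free -= s->shmem;       /* shmem can only be swapped out */

        return free < pages ? -ENOMEM : 0;
}

int main(void)
{
        struct memcg_stats s = {
                .limit = 262144, .usage = 131072, /* 1G limit, 512M used */
                .dcache = 4096, .cache = 32768, .shmem = 8192,
        };

        /* 256M (65536 pages) fits into free + reclaimable (159744
         * pages) and succeeds; 1G (262144 pages) fails with -ENOMEM */
        printf("%d %d\n", enough_memory(&s, 65536),
               enough_memory(&s, 262144));
        return 0;
}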