[Devel] [PATCH RHEL7 COMMIT] ub: add heuristic check for memory overcommit

Konstantin Khorenko khorenko at virtuozzo.com
Sat May 14 06:21:45 PDT 2016


Rolled back in vz7.12.21 due to a panic on boot: https://jira.sw.ru/browse/PSBM-47147

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 05/13/2016 06:13 PM, Konstantin Khorenko wrote:
> The commit is pushed to "branch-rh7-3.10.0-327.10.1.vz7.12.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
> after rh7-3.10.0-327.10.1.vz7.12.19
> ------>
> commit d16b8e34160ea2ca0306640d1ac875136cbb01d3
> Author: Vladimir Davydov <vdavydov at virtuozzo.com>
> Date:   Fri May 13 19:13:56 2016 +0400
>
>      ub: add heuristic check for memory overcommit
>
>      Currently, we only have the hard limit for virtual address space size
>      inside containers. This is inflexible, because setting it to a small
>      value will cause many normal allocations to fail. BTW that's why it is
>      left unlimited by default in Vz7. OTOH allowing an application to
>      allocate as much virtual address space as it wants may be bad, because
>      some applications expect to be stopped gracefully by mmap returning
>      ENOMEM instead of being killed by the OOM killer.
>
>      So this patch introduces the "heuristic" mode of overcommit accounting
>      inside containers similar to the one used on most hosts by default
>      (vm.overcommit_memory sysctl set to 0). It can be toggled system-wide by
>      changing the value of ubc.overcommit_memory sysctl. Per-beancounter
>      configuration is not supported yet, but it may be added later if needed.
>
>      If enabled (ubc.overcommit_memory = 0, this is the default), an
>      application inside a container will fail to allocate a virtual address
>      range iff its length is greater than the amount of unused plus
>      reclaimable memory accounted to the container. Note, the
>      UB_PRIVVMPAGES limit is still taken into account. If disabled
>      (ubc.overcommit_memory = 1), only the UB_PRIVVMPAGES limit will be
>      checked.
>
>      https://jira.sw.ru/browse/PSBM-45695
>
>      Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
>      Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
> ---
>   include/bc/vmpages.h    |  4 ++++
>   kernel/bc/beancounter.c |  7 +++++++
>   kernel/bc/vm_pages.c    | 28 ++++++++++++++++++++++++++++
>   mm/memcontrol.c         | 21 +++++++++++++++++++++
>   mm/mmap.c               |  9 +++------
>   5 files changed, 63 insertions(+), 6 deletions(-)
>
> diff --git a/include/bc/vmpages.h b/include/bc/vmpages.h
> index bf63b88..72a5d8e 100644
> --- a/include/bc/vmpages.h
> +++ b/include/bc/vmpages.h
> @@ -17,6 +17,8 @@
>   #include <bc/beancounter.h>
>   #include <bc/decl.h>
>
> +extern int ub_overcommit_memory;
> +
>   /*
>    * Check whether vma has private or copy-on-write mapping.
>    */
> @@ -47,4 +49,6 @@ UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi,
>   UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi,
>   			unsigned long size))
>
> +UB_DECLARE_FUNC(int, ub_enough_memory(struct mm_struct *mm, long pages))
> +
>   #endif /* __UB_PAGES_H_ */
> diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c
> index 5023bd2..18188f7 100644
> --- a/kernel/bc/beancounter.c
> +++ b/kernel/bc/beancounter.c
> @@ -1135,6 +1135,13 @@ static ctl_table ub_sysctl_table[] = {
>   		.mode		= 0644,
>   		.proc_handler	= &proc_resource_precharge,
>   	},
> +	{
> +		.procname	= "overcommit_memory",
> +		.data		= &ub_overcommit_memory,
> +		.maxlen		= sizeof(ub_overcommit_memory),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
>   #ifdef CONFIG_BC_IO_ACCOUNTING
>   	{
>   		.procname	= "dirty_ratio",
> diff --git a/kernel/bc/vm_pages.c b/kernel/bc/vm_pages.c
> index 5e588d1..b04ea13 100644
> --- a/kernel/bc/vm_pages.c
> +++ b/kernel/bc/vm_pages.c
> @@ -24,6 +24,8 @@
>   #include <bc/vmpages.h>
>   #include <bc/proc.h>
>
> +int ub_overcommit_memory;
> +
>   int ub_memory_charge(struct mm_struct *mm, unsigned long size,
>   		unsigned vm_flags, struct file *vm_file, int sv)
>   {
> @@ -119,6 +121,32 @@ void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size)
>   	uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
>   }
>
> +extern int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages);
> +
> +int ub_enough_memory(struct mm_struct *mm, long pages)
> +{
> +	struct user_beancounter *ub;
> +	struct cgroup_subsys_state *css;
> +	int ret;
> +
> +	if (!mm)
> +		return 0;
> +
> +	ub = mm->mm_ub;
> +
> +	if (ub->ub_parms[UB_PRIVVMPAGES].held >
> +	    ub->ub_parms[UB_PRIVVMPAGES].barrier)
> +		return -ENOMEM;
> +
> +	if (ub_overcommit_memory)
> +		return 0;
> +
> +	css = ub_get_mem_css(ub);
> +	ret = mem_cgroup_enough_memory(mem_cgroup_from_cont(css->cgroup), pages);
> +	css_put(css);
> +	return ret;
> +}
> +
>   static int bc_fill_sysinfo(struct user_beancounter *ub,
>   		unsigned long meminfo_val, struct sysinfo *si)
>   {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f52cd8e..b577055 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4976,6 +4976,27 @@ void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
>   	mi->shmem = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
>   }
>
> +int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
> +{
> +	long free;
> +
> +	/* unused memory */
> +	free = (res_counter_read_u64(&memcg->memsw, RES_LIMIT) -
> +		res_counter_read_u64(&memcg->memsw, RES_USAGE)) >> PAGE_SHIFT;
> +
> +	/* reclaimable slabs */
> +	free += res_counter_read_u64(&memcg->dcache, RES_USAGE) >> PAGE_SHIFT;
> +
> +	/* assume file cache is reclaimable */
> +	free += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
> +
> +	/* but do not count shmem pages as they can't be purged,
> +	 * only swapped out */
> +	free -= mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
> +
> +	return free < pages ? -ENOMEM : 0;
> +}
> +
>   static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
>   {
>   	u64 val;
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 417163e..fcd1ea3 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -135,13 +135,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
>   {
>   	unsigned long free, allowed, reserve;
>
> -	vm_acct_memory(pages);
> +	if (mm && ub_enough_memory(mm, pages) != 0)
> +		return -ENOMEM;
>
> -#ifdef CONFIG_BEANCOUNTERS
> -	if (mm && mm->mm_ub->ub_parms[UB_PRIVVMPAGES].held <=
> -			mm->mm_ub->ub_parms[UB_VMGUARPAGES].barrier)
> -		return 0;
> -#endif
> +	vm_acct_memory(pages);
>
>   	/*
>   	 * Sometimes we want to use more memory than we have
> .
>


More information about the Devel mailing list