[Devel] [PATCH v2] ve/fs: add per-VE limit of mount points

Wed Nov 11 04:01:26 PST 2015

Thanks. That's much better.
But maybe we could simplify it even more?
Please, take a look at my comments inlined. They allow to reduce impact 
to generic code even further and simplify future rebasing (which is 
usually a lot of pain).

11.11.2015 10:33, Evgenii Shatokhin пишет:
> https://jira.sw.ru/browse/PSBM-34438
>
> (This fix was adapted from PCS6.)
>
> It is possible for a container to create lots of mount points, which may
> make operations with them slower. As some of these operations take
> global locks (namespace_sem, vfsmount_lock), it might affect other
> containers as well.
>
> Let us limit the maximum number of mount points a VE may create. The
> limit can be customized via /proc/sys/fs/ve-mount-nr knob.
>
> Changes in v.2:
>
> * The situations where VE0 mounts something and another VE unmounts it
>    seem to be of no concern. If so, it is OK not to alter struct  mount:
>    the mount counter for a VE may become unbalanced but this is
>    acceptable here.
>
> * The sysctl knob is now defined alongside other VE sysctls.
>
> Signed-off-by: Evgenii Shatokhin <eshatokhin at odin.com>
> ---
>   fs/namespace.c      | 22 ++++++++++++++++++++--
>   include/linux/ve.h  |  6 ++++++
>   kernel/ve/ve.c      |  2 ++
>   kernel/ve/veowner.c | 15 +++++++++++++++
>   4 files changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 8909c13..264c0b5 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -165,7 +165,15 @@ unsigned int mnt_get_count(struct mount *mnt)
>   

/* Somewhere in our headers, or whatever other homebrew files */
static int ve_mount_allowed(void)
{
     return get_exec_env()->mnt_nr < sysctl_ve_mount_nr;
}

>   static struct mount *alloc_vfsmnt(const char *name)
>   {
> -	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
> +	struct mount *mnt;
> +	struct ve_struct *ve = get_exec_env();
> +
> +	if (ve &&
> +	    atomic_add_return(1, &ve->mnt_nr) > sysctl_ve_mount_nr &&
> +	    !ve_is_super(ve))
> +		goto out_mnt_nr_dec;
> +

  +        if (!ve_mount_allowed())
  +               return NULL;

> +	mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
>   	if (mnt) {
>   		int err;
>   
> @@ -201,7 +209,9 @@ static struct mount *alloc_vfsmnt(const char *name)
>   #ifdef CONFIG_FSNOTIFY
>   		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
>   #endif
> -	}
> +	} else
> +		goto out_mnt_nr_dec;
> +

+	ve->mnt_nr++;

With this we can grow above 4096, yes. But we anyway took this number
from out of nowhere, so I don't think we should really care about it.

>   	return mnt;
>   
>   #ifdef CONFIG_SMP
> @@ -212,6 +222,9 @@ out_free_id:
>   	mnt_free_id(mnt);
>   out_free_cache:
>   	kmem_cache_free(mnt_cache, mnt);
> +out_mnt_nr_dec:
> +	if (ve)
> +		atomic_dec(&ve->mnt_nr);
>   	return NULL;
>   }
>   
> @@ -542,6 +555,11 @@ int sb_prepare_remount_readonly(struct super_block *sb)
>   
>   static void free_vfsmnt(struct mount *mnt)
>   {
> +	struct ve_struct *ve = get_exec_env();
> +
> +	if (ve)
> +		atomic_dec(&ve->mnt_nr);
> +
>   	kfree(mnt->mnt_devname);
>   	mnt_free_id(mnt);
>   #ifdef CONFIG_SMP
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 86b95c3..02600ac 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -128,6 +128,10 @@ struct ve_struct {
>   	unsigned long		aio_nr;
>   	unsigned long		aio_max_nr;
>   #endif
> +	/* Number of mounts. May become unbalanced if VE0 mounts something
> +	 * and the VE unmounts it. This is acceptable.
> +	 */
> +	atomic_t		mnt_nr;

Since we are exposing this counter to user, and it's compared with some 
dummy "4096" value, we don't really care about exact result.
IOW, I doubt, that we require an atomic variable here.
Simple integer will be enough (especially if we take into account, that 
word-related operations are anyway atomic in x86)

+            int                                mnt_nr;

>   };
>   
>   struct ve_devmnt {
> @@ -145,6 +149,8 @@ extern int nr_ve;
>   extern struct proc_dir_entry *proc_vz_dir;
>   extern struct cgroup_subsys ve_subsys;
>   
> +extern unsigned int sysctl_ve_mount_nr;
> +
>   #ifdef CONFIG_VE_IPTABLES
>   extern __u64 ve_setup_iptables_mask(__u64 init_mask);
>   #endif
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index e9219e6..216608c 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -82,6 +82,7 @@ struct ve_struct ve0 = {
>   #endif
>   	.sched_lat_ve.cur	= &ve0_lat_stats,
>   	.init_cred		= &init_cred,
> +	.mnt_nr			= ATOMIC_INIT(0),
>   };
>   EXPORT_SYMBOL(ve0);
>   
> @@ -653,6 +654,7 @@ do_init:
>   	ve->aio_nr = 0;
>   	ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
>   #endif
> +	atomic_set(&ve->mnt_nr, 0);
   +              ve->mnt_nr = 0;
>   
>   	return &ve->css;
>   
> diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
> index 316e4d0..1a7e735 100644
> --- a/kernel/ve/veowner.c
> +++ b/kernel/ve/veowner.c
> @@ -55,6 +55,14 @@ static void prepare_proc(void)
>   int ve_xattr_policy = VE_XATTR_POLICY_ACCEPT;
>   static int ve_area_access_check;
>   
> +/*
> + * Operations with a big amount of mount points can require a lot of time.
> + * These operations take the global lock namespace_sem, so they can affect
> + * other containers. Let us allow no more than sysctl_ve_mount_nr mount
> + * points for a VE.
> + */
> +unsigned int sysctl_ve_mount_nr = 4096;
> +
>   static struct ctl_table vz_fs_table[] = {
>   	{
>   		.procname	= "ve-area-access-check",
> @@ -77,6 +85,13 @@ static struct ctl_table vz_fs_table[] = {
>   		.mode		= 0644 | S_ISVTX,
>   		.proc_handler	= &proc_dointvec_virtual,
>   	},
> +	{
> +		.procname       = "ve-mount-nr",
> +		.data           = &sysctl_ve_mount_nr,
> +		.maxlen         = sizeof(sysctl_ve_mount_nr),
> +		.mode           = 0644,
> +		.proc_handler   = proc_dointvec,
> +	},
>   	{ 0 }
>   };
>