[Devel] [PATCH] ve/fs: add per-VE limit of mount points

Stanislav Kinsburskiy skinsbursky at odin.com
Mon Nov 2 07:47:07 PST 2015


Yet again these sysctl handles...(((
I have some questions.
Please, see inlined.

02.11.2015 16:11, Evgenii Shatokhin пишет:
> https://jira.sw.ru/browse/PSBM-34438
>
> (This fix was adapted from PCS6.)
>
> It is possible for a container to create lots of mount points, which may
> make operations with them slower. As some of these operations take
> global locks (namespace_sem, vfsmount_lock), it might affect other
> containers as well.
>
> Let us limit the maximum number of mount points a VE may create. The
> limit can be customized via /proc/sys/fs/ve-mount-nr knob.
>
> Signed-off-by: Evgenii Shatokhin <eshatokhin at odin.com>
> ---
>   fs/mount.h                    |  2 ++
>   fs/namespace.c                | 34 ++++++++++++++++++++++++++++++++--
>   include/linux/mnt_namespace.h |  2 ++
>   include/linux/ve.h            |  1 +
>   include/linux/ve_proto.h      | 11 +++++++++++
>   kernel/sysctl.c               |  8 ++++++++
>   kernel/ve/ve.c                |  2 ++
>   7 files changed, 58 insertions(+), 2 deletions(-)
>
> diff --git a/fs/mount.h b/fs/mount.h
> index 3e16b57..e192ff9 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -50,6 +50,8 @@ struct mount {
>   	int mnt_expiry_mark;		/* true if marked for expiry */
>   	int mnt_pinned;
>   	int mnt_ghosts;
> +
> +	envid_t mnt_owner_veid;
>   };

Do we really need it? Can't we just take get_exec_env() on mount and umount?
Yes, in this case we won't account mount-umount operations from ve0.
But maybe we could just do not care about them?
If yes, then we can avoid adding another field to struct mount (which 
would be great, btw).

>   
>   #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 8909c13..be80e8d 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -42,6 +42,14 @@ static struct list_head mountpoint_hashtable[HASH_SIZE];
>   static struct kmem_cache *mnt_cache __read_mostly;
>   static struct rw_semaphore namespace_sem;
>   
> +/*
> + * Operations with a big amount of mount points can require a lot of time.
> + * These operations take the global lock namespace_sem, so they can affect
> + * other containers. Let us allow no more than sysctl_ve_mount_nr mount
> + * points for a VE.
> + */
> +unsigned int sysctl_ve_mount_nr = 4096;
> +

Defining sysctl to fs/namespace.c looks unnecessary.
The same apply to adding sysctl handling in kernel/sysctl.c below.
Since it's anyway a global variable, can't we place it in /proc/sys/fs/ 
with other ve sysctl's (vz_sysfs_table)?

>   /* /sys/fs */
>   struct kobject *fs_kobj;
>   EXPORT_SYMBOL_GPL(fs_kobj);
> @@ -165,7 +173,15 @@ unsigned int mnt_get_count(struct mount *mnt)
>   
>   static struct mount *alloc_vfsmnt(const char *name)
>   {
> -	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
> +	struct mount *mnt;
> +	struct ve_struct *ve = get_exec_env();
> +
> +	if (ve &&
> +	    atomic_add_return(1, &ve->mnt_nr) > sysctl_ve_mount_nr &&
> +	    !ve_is_super(ve))
> +		goto out_mnt_nr_dec;
> +
> +	mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
>   	if (mnt) {
>   		int err;
>   
> @@ -190,6 +206,8 @@ static struct mount *alloc_vfsmnt(const char *name)
>   		mnt->mnt_writers = 0;
>   #endif
>   
> +		mnt->mnt_owner_veid = get_veid(ve);
> +
>   		INIT_LIST_HEAD(&mnt->mnt_hash);
>   		INIT_LIST_HEAD(&mnt->mnt_child);
>   		INIT_LIST_HEAD(&mnt->mnt_mounts);
> @@ -201,7 +219,9 @@ static struct mount *alloc_vfsmnt(const char *name)
>   #ifdef CONFIG_FSNOTIFY
>   		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
>   #endif
> -	}
> +	} else
> +		goto out_mnt_nr_dec;
> +
>   	return mnt;
>   
>   #ifdef CONFIG_SMP
> @@ -212,6 +232,9 @@ out_free_id:
>   	mnt_free_id(mnt);
>   out_free_cache:
>   	kmem_cache_free(mnt_cache, mnt);
> +out_mnt_nr_dec:
> +	if (ve)
> +		atomic_dec(&ve->mnt_nr);
>   	return NULL;
>   }
>   
> @@ -542,6 +565,13 @@ int sb_prepare_remount_readonly(struct super_block *sb)
>   
>   static void free_vfsmnt(struct mount *mnt)
>   {
> +	struct ve_struct *ve = get_ve_by_id(mnt->mnt_owner_veid);
> +
> +	if (ve) {
> +		atomic_dec(&ve->mnt_nr);
> +		put_ve(ve);
> +	}
> +
>   	kfree(mnt->mnt_devname);
>   	mnt_free_id(mnt);
>   #ifdef CONFIG_SMP
> diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
> index 12b2ab5..4d83d3c 100644
> --- a/include/linux/mnt_namespace.h
> +++ b/include/linux/mnt_namespace.h
> @@ -14,5 +14,7 @@ extern const struct file_operations proc_mounts_operations;
>   extern const struct file_operations proc_mountinfo_operations;
>   extern const struct file_operations proc_mountstats_operations;
>   
> +extern unsigned int sysctl_ve_mount_nr;
> +
>   #endif
>   #endif
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 86b95c3..c6530c2 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -128,6 +128,7 @@ struct ve_struct {
>   	unsigned long		aio_nr;
>   	unsigned long		aio_max_nr;
>   #endif
> +	atomic_t		mnt_nr;
>   };
>   
>   struct ve_devmnt {
> diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
> index 0f5898e..3894c60 100644
> --- a/include/linux/ve_proto.h
> +++ b/include/linux/ve_proto.h
> @@ -32,6 +32,7 @@ static inline bool ve_is_super(struct ve_struct *ve)
>   
>   #define get_exec_env()		(current->task_ve)
>   #define get_env_init(ve)	(ve->ve_ns->pid_ns->child_reaper)
> +#define get_veid(ve)		(ve->veid)
>   
>   const char *ve_name(struct ve_struct *ve);
>   
> @@ -124,6 +125,11 @@ static inline struct ve_struct *get_exec_env(void)
>   
>   #define get_env_init(ve)	(ve->ve_ns->pid_ns->child_reaper)
>   
> +static inline envid_t get_veid(struct ve_struct *ve)
> +{
> +	return 0;
> +}
> +
>   static inline bool ve_is_super(struct ve_struct *ve)
>   {
>   	return true;
> @@ -139,6 +145,11 @@ static inline const char *task_ve_name(struct task_struct *task)
>   	return "0";
>   }
>   
> +static inline struct ve_struct *get_ve_by_id(envid_t veid)
> +{
> +	return NULL;
> +}
> +
>   #define nr_threads_ve(ve)	(nr_threads)
>   
>   #endif /* CONFIG_VE */
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 9c081e3..ba69200 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -64,6 +64,7 @@
>   #include <linux/sched/sysctl.h>
>   #include <linux/kexec.h>
>   #include <linux/ve.h>
> +#include <linux/mnt_namespace.h>
>   
>   #include <asm/uaccess.h>
>   #include <asm/processor.h>
> @@ -1740,6 +1741,13 @@ static struct ctl_table fs_table[] = {
>   		.proc_handler	= &pipe_proc_fn,
>   		.extra1		= &pipe_min_size,
>   	},
> +	{
> +		.procname	= "ve-mount-nr",
> +		.data		= &sysctl_ve_mount_nr,
> +		.maxlen		= sizeof(sysctl_ve_mount_nr),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
>   	{ }
>   };
>   

And, actually, the final question: is sysctl the best way to solve this 
task?
This magic "4096" number of mounts also looks odd.
Maybe it makes sense to make this feature per-ve, and place it's handler 
not to sysctls, but to ve cgroup control files?

> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index e9219e6..216608c 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -82,6 +82,7 @@ struct ve_struct ve0 = {
>   #endif
>   	.sched_lat_ve.cur	= &ve0_lat_stats,
>   	.init_cred		= &init_cred,
> +	.mnt_nr			= ATOMIC_INIT(0),
>   };
>   EXPORT_SYMBOL(ve0);
>   
> @@ -653,6 +654,7 @@ do_init:
>   	ve->aio_nr = 0;
>   	ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
>   #endif
> +	atomic_set(&ve->mnt_nr, 0);
>   
>   	return &ve->css;
>   




More information about the Devel mailing list