[Devel] [PATCH] ve/fs: add per-VE limit of mount points

Evgenii Shatokhin eshatokhin at odin.com
Wed Nov 4 23:35:56 PST 2015


02.11.2015 18:47, Stanislav Kinsburskiy пишет:
> Yet again these sysctl handles...(((
> I have some questions.
> Please, see inlined.
>
> 02.11.2015 16:11, Evgenii Shatokhin пишет:
>> https://jira.sw.ru/browse/PSBM-34438
>>
>> (This fix was adapted from PCS6.)
>>
>> It is possible for a container to create lots of mount points, which may
>> make operations with them slower. As some of these operations take
>> global locks (namespace_sem, vfsmount_lock), it might affect other
>> containers as well.
>>
>> Let us limit the maximum number of mount points a VE may create. The
>> limit can be customized via /proc/sys/fs/ve-mount-nr knob.
>>
>> Signed-off-by: Evgenii Shatokhin <eshatokhin at odin.com>
>> ---
>>   fs/mount.h                    |  2 ++
>>   fs/namespace.c                | 34 ++++++++++++++++++++++++++++++++--
>>   include/linux/mnt_namespace.h |  2 ++
>>   include/linux/ve.h            |  1 +
>>   include/linux/ve_proto.h      | 11 +++++++++++
>>   kernel/sysctl.c               |  8 ++++++++
>>   kernel/ve/ve.c                |  2 ++
>>   7 files changed, 58 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/mount.h b/fs/mount.h
>> index 3e16b57..e192ff9 100644
>> --- a/fs/mount.h
>> +++ b/fs/mount.h
>> @@ -50,6 +50,8 @@ struct mount {
>>       int mnt_expiry_mark;        /* true if marked for expiry */
>>       int mnt_pinned;
>>       int mnt_ghosts;
>> +
>> +    envid_t mnt_owner_veid;
>>   };
>
> Do we really need it? Can't we just take get_exec_env() on mount and
> umount?
> Yes, in this case we won't account mount-umount operations from ve0.
> But maybe we could just do not care about them?
> If yes, then we can avoid adding another field to struct mount (which
> would be great, btw).

Good point. Personally, I would rather leave struct mount alone. 
However, I am not 100% sure yet it is safe.

Is it possible for mount to be called from a VE and umount from VE0? If 
so, then without that field, ve->mnt_nr would not be decremented during 
free_vfsmnt() and would become unbalanced.

If such conditions are impossible, then yes, I suppose, we can do 
without additional fields in struct mount.

I introduced it only because the patch for PCS6 I backported used this 
approach.

Perhaps, the authors of that patch could shed some light?

>
>>   #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any
>> mnt_namespace */
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 8909c13..be80e8d 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -42,6 +42,14 @@ static struct list_head
>> mountpoint_hashtable[HASH_SIZE];
>>   static struct kmem_cache *mnt_cache __read_mostly;
>>   static struct rw_semaphore namespace_sem;
>> +/*
>> + * Operations with a big amount of mount points can require a lot of
>> time.
>> + * These operations take the global lock namespace_sem, so they can
>> affect
>> + * other containers. Let us allow no more than sysctl_ve_mount_nr mount
>> + * points for a VE.
>> + */
>> +unsigned int sysctl_ve_mount_nr = 4096;
>> +
>
> Defining sysctl to fs/namespace.c looks unnecessary.
> The same apply to adding sysctl handling in kernel/sysctl.c below.
> Since it's anyway a global variable, can't we place it in /proc/sys/fs/
> with other ve sysctl's (vz_sysfs_table)?
>
>>   /* /sys/fs */
>>   struct kobject *fs_kobj;
>>   EXPORT_SYMBOL_GPL(fs_kobj);
>> @@ -165,7 +173,15 @@ unsigned int mnt_get_count(struct mount *mnt)
>>   static struct mount *alloc_vfsmnt(const char *name)
>>   {
>> -    struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
>> +    struct mount *mnt;
>> +    struct ve_struct *ve = get_exec_env();
>> +
>> +    if (ve &&
>> +        atomic_add_return(1, &ve->mnt_nr) > sysctl_ve_mount_nr &&
>> +        !ve_is_super(ve))
>> +        goto out_mnt_nr_dec;
>> +
>> +    mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
>>       if (mnt) {
>>           int err;
>> @@ -190,6 +206,8 @@ static struct mount *alloc_vfsmnt(const char *name)
>>           mnt->mnt_writers = 0;
>>   #endif
>> +        mnt->mnt_owner_veid = get_veid(ve);
>> +
>>           INIT_LIST_HEAD(&mnt->mnt_hash);
>>           INIT_LIST_HEAD(&mnt->mnt_child);
>>           INIT_LIST_HEAD(&mnt->mnt_mounts);
>> @@ -201,7 +219,9 @@ static struct mount *alloc_vfsmnt(const char *name)
>>   #ifdef CONFIG_FSNOTIFY
>>           INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
>>   #endif
>> -    }
>> +    } else
>> +        goto out_mnt_nr_dec;
>> +
>>       return mnt;
>>   #ifdef CONFIG_SMP
>> @@ -212,6 +232,9 @@ out_free_id:
>>       mnt_free_id(mnt);
>>   out_free_cache:
>>       kmem_cache_free(mnt_cache, mnt);
>> +out_mnt_nr_dec:
>> +    if (ve)
>> +        atomic_dec(&ve->mnt_nr);
>>       return NULL;
>>   }
>> @@ -542,6 +565,13 @@ int sb_prepare_remount_readonly(struct
>> super_block *sb)
>>   static void free_vfsmnt(struct mount *mnt)
>>   {
>> +    struct ve_struct *ve = get_ve_by_id(mnt->mnt_owner_veid);
>> +
>> +    if (ve) {
>> +        atomic_dec(&ve->mnt_nr);
>> +        put_ve(ve);
>> +    }
>> +
>>       kfree(mnt->mnt_devname);
>>       mnt_free_id(mnt);
>>   #ifdef CONFIG_SMP
>> diff --git a/include/linux/mnt_namespace.h
>> b/include/linux/mnt_namespace.h
>> index 12b2ab5..4d83d3c 100644
>> --- a/include/linux/mnt_namespace.h
>> +++ b/include/linux/mnt_namespace.h
>> @@ -14,5 +14,7 @@ extern const struct file_operations
>> proc_mounts_operations;
>>   extern const struct file_operations proc_mountinfo_operations;
>>   extern const struct file_operations proc_mountstats_operations;
>> +extern unsigned int sysctl_ve_mount_nr;
>> +
>>   #endif
>>   #endif
>> diff --git a/include/linux/ve.h b/include/linux/ve.h
>> index 86b95c3..c6530c2 100644
>> --- a/include/linux/ve.h
>> +++ b/include/linux/ve.h
>> @@ -128,6 +128,7 @@ struct ve_struct {
>>       unsigned long        aio_nr;
>>       unsigned long        aio_max_nr;
>>   #endif
>> +    atomic_t        mnt_nr;
>>   };
>>   struct ve_devmnt {
>> diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
>> index 0f5898e..3894c60 100644
>> --- a/include/linux/ve_proto.h
>> +++ b/include/linux/ve_proto.h
>> @@ -32,6 +32,7 @@ static inline bool ve_is_super(struct ve_struct *ve)
>>   #define get_exec_env()        (current->task_ve)
>>   #define get_env_init(ve)    (ve->ve_ns->pid_ns->child_reaper)
>> +#define get_veid(ve)        (ve->veid)
>>   const char *ve_name(struct ve_struct *ve);
>> @@ -124,6 +125,11 @@ static inline struct ve_struct *get_exec_env(void)
>>   #define get_env_init(ve)    (ve->ve_ns->pid_ns->child_reaper)
>> +static inline envid_t get_veid(struct ve_struct *ve)
>> +{
>> +    return 0;
>> +}
>> +
>>   static inline bool ve_is_super(struct ve_struct *ve)
>>   {
>>       return true;
>> @@ -139,6 +145,11 @@ static inline const char *task_ve_name(struct
>> task_struct *task)
>>       return "0";
>>   }
>> +static inline struct ve_struct *get_ve_by_id(envid_t veid)
>> +{
>> +    return NULL;
>> +}
>> +
>>   #define nr_threads_ve(ve)    (nr_threads)
>>   #endif /* CONFIG_VE */
>> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
>> index 9c081e3..ba69200 100644
>> --- a/kernel/sysctl.c
>> +++ b/kernel/sysctl.c
>> @@ -64,6 +64,7 @@
>>   #include <linux/sched/sysctl.h>
>>   #include <linux/kexec.h>
>>   #include <linux/ve.h>
>> +#include <linux/mnt_namespace.h>
>>   #include <asm/uaccess.h>
>>   #include <asm/processor.h>
>> @@ -1740,6 +1741,13 @@ static struct ctl_table fs_table[] = {
>>           .proc_handler    = &pipe_proc_fn,
>>           .extra1        = &pipe_min_size,
>>       },
>> +    {
>> +        .procname    = "ve-mount-nr",
>> +        .data        = &sysctl_ve_mount_nr,
>> +        .maxlen        = sizeof(sysctl_ve_mount_nr),
>> +        .mode        = 0644,
>> +        .proc_handler    = proc_dointvec,
>> +    },
>>       { }
>>   };
>
> And, actually, the final question: is sysctl the best way to solve this
> task?
> This magic "4096" number of mounts also looks odd.
> Maybe it makes sense to make this feature per-ve, and place it's handler
> not to sysctls, but to ve cgroup control files?
>
>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>> index e9219e6..216608c 100644
>> --- a/kernel/ve/ve.c
>> +++ b/kernel/ve/ve.c
>> @@ -82,6 +82,7 @@ struct ve_struct ve0 = {
>>   #endif
>>       .sched_lat_ve.cur    = &ve0_lat_stats,
>>       .init_cred        = &init_cred,
>> +    .mnt_nr            = ATOMIC_INIT(0),
>>   };
>>   EXPORT_SYMBOL(ve0);
>> @@ -653,6 +654,7 @@ do_init:
>>       ve->aio_nr = 0;
>>       ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
>>   #endif
>> +    atomic_set(&ve->mnt_nr, 0);
>>       return &ve->css;
>
> .
>

Regards,
Evgenii




More information about the Devel mailing list