[Devel] [PATCH v2] ve/fs: add per-VE limit of mount points
Evgenii Shatokhin
eshatokhin at odin.com
Thu Nov 12 01:22:11 PST 2015
11.11.2015 15:01, Stanislav Kinsburskiy wrote:
> Thanks. That's much better.
> But maybe we could simplify it even more?
> Please take a look at my comments inline. They allow us to reduce the impact
> on generic code even further and simplify future rebasing (which is
> usually a lot of pain).
Good point. I will address this in v.3 of the patch. Thanks!
>
> 11.11.2015 10:33, Evgenii Shatokhin wrote:
>> https://jira.sw.ru/browse/PSBM-34438
>>
>> (This fix was adapted from PCS6.)
>>
>> It is possible for a container to create lots of mount points, which may
>> make operations with them slower. As some of these operations take
>> global locks (namespace_sem, vfsmount_lock), it might affect other
>> containers as well.
>>
>> Let us limit the maximum number of mount points a VE may create. The
>> limit can be customized via /proc/sys/fs/ve-mount-nr knob.
>>
>> Changes in v.2:
>>
>> * The situations where VE0 mounts something and another VE unmounts it
>> seem to be of no concern. If so, it is OK not to alter struct mount:
>> the mount counter for a VE may become unbalanced but this is
>> acceptable here.
>>
>> * The sysctl knob is now defined alongside other VE sysctls.
>>
>> Signed-off-by: Evgenii Shatokhin <eshatokhin at odin.com>
>> ---
>> fs/namespace.c | 22 ++++++++++++++++++++--
>> include/linux/ve.h | 6 ++++++
>> kernel/ve/ve.c | 2 ++
>> kernel/ve/veowner.c | 15 +++++++++++++++
>> 4 files changed, 43 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 8909c13..264c0b5 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -165,7 +165,15 @@ unsigned int mnt_get_count(struct mount *mnt)
>
> /* Somewhere in our headers, or whatever other homebrew files */
> static int ve_mount_allowed(void)
> {
> return get_exec_env()->mnt_nr < sysctl_ve_mount_nr;
> }
>
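Yes, a helper like this would keep the generic code cleaner. One detail: the
current check exempts VE0 via !ve_is_super(), so the helper should probably
keep that exemption. A rough, untested sketch of what I have in mind for v.3
(the exact name and placement are not final):

	/* Sketch only: the limit does not apply to VE0, as in the current patch */
	static int ve_mount_allowed(void)
	{
		struct ve_struct *ve = get_exec_env();

		return ve_is_super(ve) || ve->mnt_nr < sysctl_ve_mount_nr;
	}
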
>> static struct mount *alloc_vfsmnt(const char *name)
>> {
>> - struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
>> + struct mount *mnt;
>> + struct ve_struct *ve = get_exec_env();
>> +
>> + if (ve &&
>> + atomic_add_return(1, &ve->mnt_nr) > sysctl_ve_mount_nr &&
>> + !ve_is_super(ve))
>> + goto out_mnt_nr_dec;
>> +
>
> + if (!ve_mount_allowed())
> + return NULL;
>
>> + mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
>> if (mnt) {
>> int err;
>> @@ -201,7 +209,9 @@ static struct mount *alloc_vfsmnt(const char *name)
>> #ifdef CONFIG_FSNOTIFY
>> INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
>> #endif
>> - }
>> + } else
>> + goto out_mnt_nr_dec;
>> +
>
> + ve->mnt_nr++;
>
> With this we can grow above 4096, yes. But we took this number out of thin
> air anyway, so I don't think we should really care about it.
>
>> return mnt;
>> #ifdef CONFIG_SMP
>> @@ -212,6 +222,9 @@ out_free_id:
>> mnt_free_id(mnt);
>> out_free_cache:
>> kmem_cache_free(mnt_cache, mnt);
>> +out_mnt_nr_dec:
>> + if (ve)
>> + atomic_dec(&ve->mnt_nr);
>> return NULL;
>> }
>> @@ -542,6 +555,11 @@ int sb_prepare_remount_readonly(struct super_block *sb)
>> static void free_vfsmnt(struct mount *mnt)
>> {
>> + struct ve_struct *ve = get_exec_env();
>> +
>> + if (ve)
>> + atomic_dec(&ve->mnt_nr);
>> +
>> kfree(mnt->mnt_devname);
>> mnt_free_id(mnt);
>> #ifdef CONFIG_SMP
>> diff --git a/include/linux/ve.h b/include/linux/ve.h
>> index 86b95c3..02600ac 100644
>> --- a/include/linux/ve.h
>> +++ b/include/linux/ve.h
>> @@ -128,6 +128,10 @@ struct ve_struct {
>> unsigned long aio_nr;
>> unsigned long aio_max_nr;
>> #endif
>> + /* Number of mounts. May become unbalanced if VE0 mounts something
>> + * and the VE unmounts it. This is acceptable.
>> + */
>> + atomic_t mnt_nr;
>
> Since we are exposing this counter to the user, and it's compared with an
> arbitrary "4096" value, we don't really care about the exact result.
> IOW, I doubt that we require an atomic variable here.
> A simple integer will be enough (especially if we take into account that
> word-sized operations are atomic on x86 anyway).
>
> + int mnt_nr;
>
>> };
>> struct ve_devmnt {
>> @@ -145,6 +149,8 @@ extern int nr_ve;
>> extern struct proc_dir_entry *proc_vz_dir;
>> extern struct cgroup_subsys ve_subsys;
>> +extern unsigned int sysctl_ve_mount_nr;
>> +
>> #ifdef CONFIG_VE_IPTABLES
>> extern __u64 ve_setup_iptables_mask(__u64 init_mask);
>> #endif
>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>> index e9219e6..216608c 100644
>> --- a/kernel/ve/ve.c
>> +++ b/kernel/ve/ve.c
>> @@ -82,6 +82,7 @@ struct ve_struct ve0 = {
>> #endif
>> .sched_lat_ve.cur = &ve0_lat_stats,
>> .init_cred = &init_cred,
>> + .mnt_nr = ATOMIC_INIT(0),
>> };
>> EXPORT_SYMBOL(ve0);
>> @@ -653,6 +654,7 @@ do_init:
>> ve->aio_nr = 0;
>> ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
>> #endif
>> + atomic_set(&ve->mnt_nr, 0);
> + ve->mnt_nr = 0;
>> return &ve->css;
>> diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
>> index 316e4d0..1a7e735 100644
>> --- a/kernel/ve/veowner.c
>> +++ b/kernel/ve/veowner.c
>> @@ -55,6 +55,14 @@ static void prepare_proc(void)
>> int ve_xattr_policy = VE_XATTR_POLICY_ACCEPT;
>> static int ve_area_access_check;
>> +/*
>> + * Operations with a big amount of mount points can require a lot of time.
>> + * These operations take the global lock namespace_sem, so they can affect
>> + * other containers. Let us allow no more than sysctl_ve_mount_nr mount
>> + * points for a VE.
>> + */
>> +unsigned int sysctl_ve_mount_nr = 4096;
>> +
>> static struct ctl_table vz_fs_table[] = {
>> {
>> .procname = "ve-area-access-check",
>> @@ -77,6 +85,13 @@ static struct ctl_table vz_fs_table[] = {
>> .mode = 0644 | S_ISVTX,
>> .proc_handler = &proc_dointvec_virtual,
>> },
>> + {
>> + .procname = "ve-mount-nr",
>> + .data = &sysctl_ve_mount_nr,
>> + .maxlen = sizeof(sysctl_ve_mount_nr),
>> + .mode = 0644,
>> + .proc_handler = proc_dointvec,
>> + },
>> { 0 }
>> };
>
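Putting both suggestions together (the helper plus a plain int counter),
alloc_vfsmnt() and free_vfsmnt() in v.3 could end up looking roughly like
this. This is only a sketch to check that I understood the idea correctly;
the existing initialization and error handling are omitted and unchanged:

	static struct mount *alloc_vfsmnt(const char *name)
	{
		struct mount *mnt;

		if (!ve_mount_allowed())
			return NULL;

		mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
		if (!mnt)
			return NULL;

		/* ... existing initialization and error handling ... */

		get_exec_env()->mnt_nr++;
		return mnt;
	}

	static void free_vfsmnt(struct mount *mnt)
	{
		/*
		 * The counter may become unbalanced if VE0 mounted something
		 * and a VE unmounts it; as discussed, this is acceptable.
		 */
		get_exec_env()->mnt_nr--;

		kfree(mnt->mnt_devname);
		mnt_free_id(mnt);
		/* ... */
	}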