[Devel] [PATCH] ve/fs: add per-VE limit of mount points

Thu Nov 5 08:37:31 PST 2015

05.11.2015 18:45, Konstantin Khorenko пишет:
> Zenya,
>
> 1) an idea from Volodya Davydov: we can store the pointer to ve_struct,
> not the veid,
>     thus we'll eliminate possible races with veid reuse.

OK.
The main question is though, whether we should care about the situations 
where something is mounted from a VE and umounted from VE0. If we should 
not, it is not needed to change struct mount at all.

>
> 2) there are 2 more comments from Stas (3 total), please take a look.
>

Yes, I saw them.

As for defining the sysctl in vz_fs_table where other VZ-specific ones 
are, there are no objections from me. Same for moving the variable to 
the appropriate file. Will do that in v2 of the patch.

Concerning 4096 - it is there mostly because PCS6 uses the same limit. A 
nice round number, I guess. We could start from there and see.

>
> i don't mind leaving it as a global sysctl because hopefully it will
> never used =>
> less time spent to rework. And we currently really don't need it to be
> per-CT.
>
> --
> Best regards,
>
> Konstantin Khorenko,
> Virtuozzo Linux Kernel Team
>
> On 11/05/2015 10:35 AM, Evgenii Shatokhin wrote:
>> 02.11.2015 18:47, Stanislav Kinsburskiy пишет:
>>> Yet again these sysctl handles...(((
>>> I have some questions.
>>> Please, see inlined.
>>>
>>> 02.11.2015 16:11, Evgenii Shatokhin пишет:
>>>> https://jira.sw.ru/browse/PSBM-34438
>>>>
>>>> (This fix was adapted from PCS6.)
>>>>
>>>> It is possible for a container to create lots of mount points, which
>>>> may
>>>> make operations with them slower. As some of these operations take
>>>> global locks (namespace_sem, vfsmount_lock), it might affect other
>>>> containers as well.
>>>>
>>>> Let us limit the maximum number of mount points a VE may create. The
>>>> limit can be customized via /proc/sys/fs/ve-mount-nr knob.
>>>>
>>>> Signed-off-by: Evgenii Shatokhin <eshatokhin at odin.com>
>>>> ---
>>>>    fs/mount.h                    |  2 ++
>>>>    fs/namespace.c                | 34
>>>> ++++++++++++++++++++++++++++++++--
>>>>    include/linux/mnt_namespace.h |  2 ++
>>>>    include/linux/ve.h            |  1 +
>>>>    include/linux/ve_proto.h      | 11 +++++++++++
>>>>    kernel/sysctl.c               |  8 ++++++++
>>>>    kernel/ve/ve.c                |  2 ++
>>>>    7 files changed, 58 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/fs/mount.h b/fs/mount.h
>>>> index 3e16b57..e192ff9 100644
>>>> --- a/fs/mount.h
>>>> +++ b/fs/mount.h
>>>> @@ -50,6 +50,8 @@ struct mount {
>>>>        int mnt_expiry_mark;        /* true if marked for expiry */
>>>>        int mnt_pinned;
>>>>        int mnt_ghosts;
>>>> +
>>>> +    envid_t mnt_owner_veid;
>>>>    };
>>>
>>> Do we really need it? Can't we just take get_exec_env() on mount and
>>> umount?
>>> Yes, in this case we won't account mount-umount operations from ve0.
>>> But maybe we could just do not care about them?
>>> If yes, then we can avoid adding another field to struct mount (which
>>> would be great, btw).
>>
>> Good point. Personally, I would rather leave struct mount alone.
>> However, I am not 100% sure yet it is safe.
>>
>> Is it possible for mount to be called from a VE and umount from VE0? If
>> so, then without that field, ve->mnt_nr would not be decremented during
>> free_vfsmnt() and would become unbalanced.
>>
>> If such conditions are impossible, then yes, I suppose, we can do
>> without additional fields in struct mount.
>>
>> I introduced it only because the patch for PCS6 I backported used this
>> approach.
>>
>> Perhaps, the authors of that patch could shed some light?
>>
>>>
>>>>    #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any
>>>> mnt_namespace */
>>>> diff --git a/fs/namespace.c b/fs/namespace.c
>>>> index 8909c13..be80e8d 100644
>>>> --- a/fs/namespace.c
>>>> +++ b/fs/namespace.c
>>>> @@ -42,6 +42,14 @@ static struct list_head
>>>> mountpoint_hashtable[HASH_SIZE];
>>>>    static struct kmem_cache *mnt_cache __read_mostly;
>>>>    static struct rw_semaphore namespace_sem;
>>>> +/*
>>>> + * Operations with a big amount of mount points can require a lot of
>>>> time.
>>>> + * These operations take the global lock namespace_sem, so they can
>>>> affect
>>>> + * other containers. Let us allow no more than sysctl_ve_mount_nr
>>>> mount
>>>> + * points for a VE.
>>>> + */
>>>> +unsigned int sysctl_ve_mount_nr = 4096;
>>>> +
>>>
>>> Defining sysctl to fs/namespace.c looks unnecessary.
>>> The same apply to adding sysctl handling in kernel/sysctl.c below.
>>> Since it's anyway a global variable, can't we place it in /proc/sys/fs/
>>> with other ve sysctl's (vz_sysfs_table)?
>>>
>>>>    /* /sys/fs */
>>>>    struct kobject *fs_kobj;
>>>>    EXPORT_SYMBOL_GPL(fs_kobj);
>>>> @@ -165,7 +173,15 @@ unsigned int mnt_get_count(struct mount *mnt)
>>>>    static struct mount *alloc_vfsmnt(const char *name)
>>>>    {
>>>> -    struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
>>>> +    struct mount *mnt;
>>>> +    struct ve_struct *ve = get_exec_env();
>>>> +
>>>> +    if (ve &&
>>>> +        atomic_add_return(1, &ve->mnt_nr) > sysctl_ve_mount_nr &&
>>>> +        !ve_is_super(ve))
>>>> +        goto out_mnt_nr_dec;
>>>> +
>>>> +    mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
>>>>        if (mnt) {
>>>>            int err;
>>>> @@ -190,6 +206,8 @@ static struct mount *alloc_vfsmnt(const char *name)
>>>>            mnt->mnt_writers = 0;
>>>>    #endif
>>>> +        mnt->mnt_owner_veid = get_veid(ve);
>>>> +
>>>>            INIT_LIST_HEAD(&mnt->mnt_hash);
>>>>            INIT_LIST_HEAD(&mnt->mnt_child);
>>>>            INIT_LIST_HEAD(&mnt->mnt_mounts);
>>>> @@ -201,7 +219,9 @@ static struct mount *alloc_vfsmnt(const char *name)
>>>>    #ifdef CONFIG_FSNOTIFY
>>>>            INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
>>>>    #endif
>>>> -    }
>>>> +    } else
>>>> +        goto out_mnt_nr_dec;
>>>> +
>>>>        return mnt;
>>>>    #ifdef CONFIG_SMP
>>>> @@ -212,6 +232,9 @@ out_free_id:
>>>>        mnt_free_id(mnt);
>>>>    out_free_cache:
>>>>        kmem_cache_free(mnt_cache, mnt);
>>>> +out_mnt_nr_dec:
>>>> +    if (ve)
>>>> +        atomic_dec(&ve->mnt_nr);
>>>>        return NULL;
>>>>    }
>>>> @@ -542,6 +565,13 @@ int sb_prepare_remount_readonly(struct
>>>> super_block *sb)
>>>>    static void free_vfsmnt(struct mount *mnt)
>>>>    {
>>>> +    struct ve_struct *ve = get_ve_by_id(mnt->mnt_owner_veid);
>>>> +
>>>> +    if (ve) {
>>>> +        atomic_dec(&ve->mnt_nr);
>>>> +        put_ve(ve);
>>>> +    }
>>>> +
>>>>        kfree(mnt->mnt_devname);
>>>>        mnt_free_id(mnt);
>>>>    #ifdef CONFIG_SMP
>>>> diff --git a/include/linux/mnt_namespace.h
>>>> b/include/linux/mnt_namespace.h
>>>> index 12b2ab5..4d83d3c 100644
>>>> --- a/include/linux/mnt_namespace.h
>>>> +++ b/include/linux/mnt_namespace.h
>>>> @@ -14,5 +14,7 @@ extern const struct file_operations
>>>> proc_mounts_operations;
>>>>    extern const struct file_operations proc_mountinfo_operations;
>>>>    extern const struct file_operations proc_mountstats_operations;
>>>> +extern unsigned int sysctl_ve_mount_nr;
>>>> +
>>>>    #endif
>>>>    #endif
>>>> diff --git a/include/linux/ve.h b/include/linux/ve.h
>>>> index 86b95c3..c6530c2 100644
>>>> --- a/include/linux/ve.h
>>>> +++ b/include/linux/ve.h
>>>> @@ -128,6 +128,7 @@ struct ve_struct {
>>>>        unsigned long        aio_nr;
>>>>        unsigned long        aio_max_nr;
>>>>    #endif
>>>> +    atomic_t        mnt_nr;
>>>>    };
>>>>    struct ve_devmnt {
>>>> diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
>>>> index 0f5898e..3894c60 100644
>>>> --- a/include/linux/ve_proto.h
>>>> +++ b/include/linux/ve_proto.h
>>>> @@ -32,6 +32,7 @@ static inline bool ve_is_super(struct ve_struct *ve)
>>>>    #define get_exec_env()        (current->task_ve)
>>>>    #define get_env_init(ve)    (ve->ve_ns->pid_ns->child_reaper)
>>>> +#define get_veid(ve)        (ve->veid)
>>>>    const char *ve_name(struct ve_struct *ve);
>>>> @@ -124,6 +125,11 @@ static inline struct ve_struct *get_exec_env(void)
>>>>    #define get_env_init(ve)    (ve->ve_ns->pid_ns->child_reaper)
>>>> +static inline envid_t get_veid(struct ve_struct *ve)
>>>> +{
>>>> +    return 0;
>>>> +}
>>>> +
>>>>    static inline bool ve_is_super(struct ve_struct *ve)
>>>>    {
>>>>        return true;
>>>> @@ -139,6 +145,11 @@ static inline const char *task_ve_name(struct
>>>> task_struct *task)
>>>>        return "0";
>>>>    }
>>>> +static inline struct ve_struct *get_ve_by_id(envid_t veid)
>>>> +{
>>>> +    return NULL;
>>>> +}
>>>> +
>>>>    #define nr_threads_ve(ve)    (nr_threads)
>>>>    #endif /* CONFIG_VE */
>>>> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
>>>> index 9c081e3..ba69200 100644
>>>> --- a/kernel/sysctl.c
>>>> +++ b/kernel/sysctl.c
>>>> @@ -64,6 +64,7 @@
>>>>    #include <linux/sched/sysctl.h>
>>>>    #include <linux/kexec.h>
>>>>    #include <linux/ve.h>
>>>> +#include <linux/mnt_namespace.h>
>>>>    #include <asm/uaccess.h>
>>>>    #include <asm/processor.h>
>>>> @@ -1740,6 +1741,13 @@ static struct ctl_table fs_table[] = {
>>>>            .proc_handler    = &pipe_proc_fn,
>>>>            .extra1        = &pipe_min_size,
>>>>        },
>>>> +    {
>>>> +        .procname    = "ve-mount-nr",
>>>> +        .data        = &sysctl_ve_mount_nr,
>>>> +        .maxlen        = sizeof(sysctl_ve_mount_nr),
>>>> +        .mode        = 0644,
>>>> +        .proc_handler    = proc_dointvec,
>>>> +    },
>>>>        { }
>>>>    };
>>>
>>> And, actually, the final question: is sysctl the best way to solve this
>>> task?
>>> This magic "4096" number of mounts also looks odd.
>>> Maybe it makes sense to make this feature per-ve, and place it's handler
>>> not to sysctls, but to ve cgroup control files?
>>>
>>>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>>>> index e9219e6..216608c 100644
>>>> --- a/kernel/ve/ve.c
>>>> +++ b/kernel/ve/ve.c
>>>> @@ -82,6 +82,7 @@ struct ve_struct ve0 = {
>>>>    #endif
>>>>        .sched_lat_ve.cur    = &ve0_lat_stats,
>>>>        .init_cred        = &init_cred,
>>>> +    .mnt_nr            = ATOMIC_INIT(0),
>>>>    };
>>>>    EXPORT_SYMBOL(ve0);
>>>> @@ -653,6 +654,7 @@ do_init:
>>>>        ve->aio_nr = 0;
>>>>        ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
>>>>    #endif
>>>> +    atomic_set(&ve->mnt_nr, 0);
>>>>        return &ve->css;
>>>
>>> .
>>>
>>
>> Regards,
>> Evgenii
>>
>>
> .
>