[Devel] [PATCH] ve/fs: add per-VE limit of mount points
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Nov 5 07:45:42 PST 2015
Zenya,
1) an idea from Volodya Davydov: we can store the pointer to ve_struct, not the veid,
thus we'll eliminate possible races with veid reuse.
2) there are 2 more comments from Stas (3 total), please take a look.
i don't mind leaving it as a global sysctl because hopefully it will never used =>
less time spent to rework. And we currently really don't need it to be per-CT.
--
Best regards,
Konstantin Khorenko,
Virtuozzo Linux Kernel Team
On 11/05/2015 10:35 AM, Evgenii Shatokhin wrote:
> 02.11.2015 18:47, Stanislav Kinsburskiy пишет:
>> Yet again these sysctl handles...(((
>> I have some questions.
>> Please, see inlined.
>>
>> 02.11.2015 16:11, Evgenii Shatokhin пишет:
>>> https://jira.sw.ru/browse/PSBM-34438
>>>
>>> (This fix was adapted from PCS6.)
>>>
>>> It is possible for a container to create lots of mount points, which may
>>> make operations with them slower. As some of these operations take
>>> global locks (namespace_sem, vfsmount_lock), it might affect other
>>> containers as well.
>>>
>>> Let us limit the maximum number of mount points a VE may create. The
>>> limit can be customized via /proc/sys/fs/ve-mount-nr knob.
>>>
>>> Signed-off-by: Evgenii Shatokhin <eshatokhin at odin.com>
>>> ---
>>> fs/mount.h | 2 ++
>>> fs/namespace.c | 34 ++++++++++++++++++++++++++++++++--
>>> include/linux/mnt_namespace.h | 2 ++
>>> include/linux/ve.h | 1 +
>>> include/linux/ve_proto.h | 11 +++++++++++
>>> kernel/sysctl.c | 8 ++++++++
>>> kernel/ve/ve.c | 2 ++
>>> 7 files changed, 58 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/fs/mount.h b/fs/mount.h
>>> index 3e16b57..e192ff9 100644
>>> --- a/fs/mount.h
>>> +++ b/fs/mount.h
>>> @@ -50,6 +50,8 @@ struct mount {
>>> int mnt_expiry_mark; /* true if marked for expiry */
>>> int mnt_pinned;
>>> int mnt_ghosts;
>>> +
>>> + envid_t mnt_owner_veid;
>>> };
>>
>> Do we really need it? Can't we just take get_exec_env() on mount and
>> umount?
>> Yes, in this case we won't account mount-umount operations from ve0.
>> But maybe we could just do not care about them?
>> If yes, then we can avoid adding another field to struct mount (which
>> would be great, btw).
>
> Good point. Personally, I would rather leave struct mount alone.
> However, I am not 100% sure yet it is safe.
>
> Is it possible for mount to be called from a VE and umount from VE0? If
> so, then without that field, ve->mnt_nr would not be decremented during
> free_vfsmnt() and would become unbalanced.
>
> If such conditions are impossible, then yes, I suppose, we can do
> without additional fields in struct mount.
>
> I introduced it only because the patch for PCS6 I backported used this
> approach.
>
> Perhaps, the authors of that patch could shed some light?
>
>>
>>> #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any
>>> mnt_namespace */
>>> diff --git a/fs/namespace.c b/fs/namespace.c
>>> index 8909c13..be80e8d 100644
>>> --- a/fs/namespace.c
>>> +++ b/fs/namespace.c
>>> @@ -42,6 +42,14 @@ static struct list_head
>>> mountpoint_hashtable[HASH_SIZE];
>>> static struct kmem_cache *mnt_cache __read_mostly;
>>> static struct rw_semaphore namespace_sem;
>>> +/*
>>> + * Operations with a big amount of mount points can require a lot of
>>> time.
>>> + * These operations take the global lock namespace_sem, so they can
>>> affect
>>> + * other containers. Let us allow no more than sysctl_ve_mount_nr mount
>>> + * points for a VE.
>>> + */
>>> +unsigned int sysctl_ve_mount_nr = 4096;
>>> +
>>
>> Defining sysctl to fs/namespace.c looks unnecessary.
>> The same apply to adding sysctl handling in kernel/sysctl.c below.
>> Since it's anyway a global variable, can't we place it in /proc/sys/fs/
>> with other ve sysctl's (vz_sysfs_table)?
>>
>>> /* /sys/fs */
>>> struct kobject *fs_kobj;
>>> EXPORT_SYMBOL_GPL(fs_kobj);
>>> @@ -165,7 +173,15 @@ unsigned int mnt_get_count(struct mount *mnt)
>>> static struct mount *alloc_vfsmnt(const char *name)
>>> {
>>> - struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
>>> + struct mount *mnt;
>>> + struct ve_struct *ve = get_exec_env();
>>> +
>>> + if (ve &&
>>> + atomic_add_return(1, &ve->mnt_nr) > sysctl_ve_mount_nr &&
>>> + !ve_is_super(ve))
>>> + goto out_mnt_nr_dec;
>>> +
>>> + mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
>>> if (mnt) {
>>> int err;
>>> @@ -190,6 +206,8 @@ static struct mount *alloc_vfsmnt(const char *name)
>>> mnt->mnt_writers = 0;
>>> #endif
>>> + mnt->mnt_owner_veid = get_veid(ve);
>>> +
>>> INIT_LIST_HEAD(&mnt->mnt_hash);
>>> INIT_LIST_HEAD(&mnt->mnt_child);
>>> INIT_LIST_HEAD(&mnt->mnt_mounts);
>>> @@ -201,7 +219,9 @@ static struct mount *alloc_vfsmnt(const char *name)
>>> #ifdef CONFIG_FSNOTIFY
>>> INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
>>> #endif
>>> - }
>>> + } else
>>> + goto out_mnt_nr_dec;
>>> +
>>> return mnt;
>>> #ifdef CONFIG_SMP
>>> @@ -212,6 +232,9 @@ out_free_id:
>>> mnt_free_id(mnt);
>>> out_free_cache:
>>> kmem_cache_free(mnt_cache, mnt);
>>> +out_mnt_nr_dec:
>>> + if (ve)
>>> + atomic_dec(&ve->mnt_nr);
>>> return NULL;
>>> }
>>> @@ -542,6 +565,13 @@ int sb_prepare_remount_readonly(struct
>>> super_block *sb)
>>> static void free_vfsmnt(struct mount *mnt)
>>> {
>>> + struct ve_struct *ve = get_ve_by_id(mnt->mnt_owner_veid);
>>> +
>>> + if (ve) {
>>> + atomic_dec(&ve->mnt_nr);
>>> + put_ve(ve);
>>> + }
>>> +
>>> kfree(mnt->mnt_devname);
>>> mnt_free_id(mnt);
>>> #ifdef CONFIG_SMP
>>> diff --git a/include/linux/mnt_namespace.h
>>> b/include/linux/mnt_namespace.h
>>> index 12b2ab5..4d83d3c 100644
>>> --- a/include/linux/mnt_namespace.h
>>> +++ b/include/linux/mnt_namespace.h
>>> @@ -14,5 +14,7 @@ extern const struct file_operations
>>> proc_mounts_operations;
>>> extern const struct file_operations proc_mountinfo_operations;
>>> extern const struct file_operations proc_mountstats_operations;
>>> +extern unsigned int sysctl_ve_mount_nr;
>>> +
>>> #endif
>>> #endif
>>> diff --git a/include/linux/ve.h b/include/linux/ve.h
>>> index 86b95c3..c6530c2 100644
>>> --- a/include/linux/ve.h
>>> +++ b/include/linux/ve.h
>>> @@ -128,6 +128,7 @@ struct ve_struct {
>>> unsigned long aio_nr;
>>> unsigned long aio_max_nr;
>>> #endif
>>> + atomic_t mnt_nr;
>>> };
>>> struct ve_devmnt {
>>> diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
>>> index 0f5898e..3894c60 100644
>>> --- a/include/linux/ve_proto.h
>>> +++ b/include/linux/ve_proto.h
>>> @@ -32,6 +32,7 @@ static inline bool ve_is_super(struct ve_struct *ve)
>>> #define get_exec_env() (current->task_ve)
>>> #define get_env_init(ve) (ve->ve_ns->pid_ns->child_reaper)
>>> +#define get_veid(ve) (ve->veid)
>>> const char *ve_name(struct ve_struct *ve);
>>> @@ -124,6 +125,11 @@ static inline struct ve_struct *get_exec_env(void)
>>> #define get_env_init(ve) (ve->ve_ns->pid_ns->child_reaper)
>>> +static inline envid_t get_veid(struct ve_struct *ve)
>>> +{
>>> + return 0;
>>> +}
>>> +
>>> static inline bool ve_is_super(struct ve_struct *ve)
>>> {
>>> return true;
>>> @@ -139,6 +145,11 @@ static inline const char *task_ve_name(struct
>>> task_struct *task)
>>> return "0";
>>> }
>>> +static inline struct ve_struct *get_ve_by_id(envid_t veid)
>>> +{
>>> + return NULL;
>>> +}
>>> +
>>> #define nr_threads_ve(ve) (nr_threads)
>>> #endif /* CONFIG_VE */
>>> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
>>> index 9c081e3..ba69200 100644
>>> --- a/kernel/sysctl.c
>>> +++ b/kernel/sysctl.c
>>> @@ -64,6 +64,7 @@
>>> #include <linux/sched/sysctl.h>
>>> #include <linux/kexec.h>
>>> #include <linux/ve.h>
>>> +#include <linux/mnt_namespace.h>
>>> #include <asm/uaccess.h>
>>> #include <asm/processor.h>
>>> @@ -1740,6 +1741,13 @@ static struct ctl_table fs_table[] = {
>>> .proc_handler = &pipe_proc_fn,
>>> .extra1 = &pipe_min_size,
>>> },
>>> + {
>>> + .procname = "ve-mount-nr",
>>> + .data = &sysctl_ve_mount_nr,
>>> + .maxlen = sizeof(sysctl_ve_mount_nr),
>>> + .mode = 0644,
>>> + .proc_handler = proc_dointvec,
>>> + },
>>> { }
>>> };
>>
>> And, actually, the final question: is sysctl the best way to solve this
>> task?
>> This magic "4096" number of mounts also looks odd.
>> Maybe it makes sense to make this feature per-ve, and place it's handler
>> not to sysctls, but to ve cgroup control files?
>>
>>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>>> index e9219e6..216608c 100644
>>> --- a/kernel/ve/ve.c
>>> +++ b/kernel/ve/ve.c
>>> @@ -82,6 +82,7 @@ struct ve_struct ve0 = {
>>> #endif
>>> .sched_lat_ve.cur = &ve0_lat_stats,
>>> .init_cred = &init_cred,
>>> + .mnt_nr = ATOMIC_INIT(0),
>>> };
>>> EXPORT_SYMBOL(ve0);
>>> @@ -653,6 +654,7 @@ do_init:
>>> ve->aio_nr = 0;
>>> ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
>>> #endif
>>> + atomic_set(&ve->mnt_nr, 0);
>>> return &ve->css;
>>
>> .
>>
>
> Regards,
> Evgenii
>
>
More information about the Devel
mailing list