[Devel] [PATCH RHEL7 COMMIT] ve/fs: add per-VE limit of mount points
Konstantin Khorenko
khorenko@virtuozzo.com
Mon Dec 28 02:37:10 PST 2015
The commit is pushed to "branch-rh7-3.10.0-229.7.2.vz7.9.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.9.17
------>
commit f0d09504e70598e049110e303b070dd63b52d329
Author: Evgenii Shatokhin <eshatokhin@odin.com>
Date: Mon Dec 28 14:37:10 2015 +0400
ve/fs: add per-VE limit of mount points
https://jira.sw.ru/browse/PSBM-34438
(This fix was adapted from PCS6.)
It is possible for a container to create lots of mount points, which
can make operations on them slower. As some of these operations take
global locks (namespace_sem, vfsmount_lock), other containers might be
affected as well.

Let us limit the maximum number of mount points a VE may create. The
limit can be customized via the /proc/sys/fs/ve-mount-nr sysctl knob.
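
For illustration only (not part of the patch), here is a minimal
userspace sketch that reads the current limit from the new knob and
doubles it. It assumes a kernel with this patch applied and must run
as root in VE0:

    /* Hypothetical admin helper: read and raise the per-VE mount
     * limit through /proc/sys/fs/ve-mount-nr. */
    #include <stdio.h>

    int main(void)
    {
            unsigned int limit;
            FILE *f = fopen("/proc/sys/fs/ve-mount-nr", "r+");

            if (!f) {
                    perror("ve-mount-nr");
                    return 1;
            }
            if (fscanf(f, "%u", &limit) != 1) {
                    fclose(f);
                    return 1;
            }
            printf("current per-VE mount limit: %u\n", limit);

            /* Double the limit for all VEs. */
            rewind(f);
            fprintf(f, "%u\n", limit * 2);
            fclose(f);
            return 0;
    }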
Changes in v.3:
* Reworked the VE-specific parts of the patch to reduce the impact on
  the generic code.

Changes in v.2:
* Situations where VE0 mounts something and another VE unmounts it
  seem to be of no concern. If so, it is OK not to alter struct mount:
  the mount counter of a VE may become unbalanced, but that is
  acceptable here (see the toy model below).
* The sysctl knob is now defined alongside the other VE sysctls.
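
To make the v.2 note concrete, here is a toy userspace model (the
types and helpers are made up for illustration, not kernel API) of
how the counter becomes unbalanced when the mounting and unmounting
VEs differ:

    /* Toy model: alloc_vfsmnt() charges whichever VE performs the
     * mount, free_vfsmnt() credits whichever VE does the unmount. */
    #include <stdio.h>

    struct ve { const char *name; int mnt_nr; };

    static struct ve ve0 = { "VE0", 0 };
    static struct ve ve101 = { "VE101", 0 };

    static void do_mount(struct ve *ve)  { ve->mnt_nr++; }
    static void do_umount(struct ve *ve) { ve->mnt_nr--; }

    int main(void)
    {
            do_mount(&ve0);     /* VE0 mounts something for the CT */
            do_umount(&ve101);  /* the CT unmounts it itself */

            /* Prints "VE0: 1, VE101: -1": unbalanced, but harmless. */
            printf("%s: %d, %s: %d\n", ve0.name, ve0.mnt_nr,
                   ve101.name, ve101.mnt_nr);
            return 0;
    }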
Signed-off-by: Evgenii Shatokhin <eshatokhin@virtuozzo.com>
Acked-by: Stanislav Kinsburskiy <skinsbursky@virtuozzo.com>
Testing results:

This "mount bomb" now ends with "cannot allocate memory" and does no
harm:
https://github.com/avagin/ctb/blob/master/mounts/run.sh
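
The test can also be reproduced with a few lines of C. A sketch,
assuming /mnt exists and the program runs as root inside a container;
on a patched kernel the loop stops once the VE reaches
sysctl_ve_mount_nr and mount(2) fails with ENOMEM:

    /* Minimal "mount bomb": stack tmpfs mounts on /mnt until the
     * kernel refuses. With this patch, alloc_vfsmnt() returns NULL
     * once the per-VE limit is hit, so mount(2) fails with ENOMEM. */
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mount.h>

    int main(void)
    {
            unsigned long n = 0;

            while (mount("none", "/mnt", "tmpfs", 0, NULL) == 0)
                    n++;
            printf("%lu mounts, then: %s\n", n, strerror(errno));
            return 0;
    }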
---
 fs/namespace.c      |  9 ++++++++-
 include/linux/ve.h  | 27 +++++++++++++++++++++++++++
 kernel/ve/ve.c      |  2 ++
 kernel/ve/veowner.c | 15 +++++++++++++++
 4 files changed, 52 insertions(+), 1 deletion(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index c051e42..fc8ea36 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -165,7 +165,12 @@ unsigned int mnt_get_count(struct mount *mnt)
 static struct mount *alloc_vfsmnt(const char *name)
 {
-        struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
+        struct mount *mnt;
+
+        if (!ve_mount_allowed())
+                return NULL;
+
+        mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
         if (mnt) {
                 int err;
@@ -202,6 +207,7 @@ static struct mount *alloc_vfsmnt(const char *name)
                 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
 #endif
         }
+        ve_mount_nr_inc();
         return mnt;
 
 #ifdef CONFIG_SMP
@@ -542,6 +548,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 
 static void free_vfsmnt(struct mount *mnt)
 {
+        ve_mount_nr_dec();
         kfree(mnt->mnt_devname);
         mnt_free_id(mnt);
 #ifdef CONFIG_SMP
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 87450c1..ee299e3 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -130,6 +130,10 @@ struct ve_struct {
         unsigned long aio_nr;
         unsigned long aio_max_nr;
 #endif
+        /* Number of mounts. May become unbalanced if VE0 mounts something
+         * and the VE unmounts it. This is acceptable.
+         */
+        int mnt_nr;
 };
 
 struct ve_devmnt {
@@ -147,6 +151,8 @@ extern int nr_ve;
 extern struct proc_dir_entry *proc_vz_dir;
 extern struct cgroup_subsys ve_subsys;
 
+extern unsigned int sysctl_ve_mount_nr;
+
 #ifdef CONFIG_VE_IPTABLES
 extern __u64 ve_setup_iptables_mask(__u64 init_mask);
 #endif
@@ -226,6 +232,23 @@ extern struct tty_driver *vtty_console_driver(int *index);
 extern int vtty_open_master(envid_t veid, int idx);
 #endif /* CONFIG_TTY */
 
+static inline int ve_mount_allowed(void)
+{
+        struct ve_struct *ve = get_exec_env();
+
+        return ve_is_super(ve) || ve->mnt_nr < sysctl_ve_mount_nr;
+}
+
+static inline void ve_mount_nr_inc(void)
+{
+        get_exec_env()->mnt_nr++;
+}
+
+static inline void ve_mount_nr_dec(void)
+{
+        get_exec_env()->mnt_nr--;
+}
+
 #else /* CONFIG_VE */
 
 #define ve_uevent_seqnum uevent_seqnum
@@ -263,6 +286,10 @@ static inline void monotonic_abs_to_ve(clockid_t which_clock,
                                        struct timespec *tp) { }
 static inline void monotonic_ve_to_abs(clockid_t which_clock,
                                        struct timespec *tp) { }
+
+static inline int ve_mount_allowed(void) { return 1; }
+static inline void ve_mount_nr_inc(void) { }
+static inline void ve_mount_nr_dec(void) { }
 #endif /* CONFIG_VE */
 
 #endif /* _LINUX_VE_H */
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 8f5f905..1f8e5da 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -82,6 +82,7 @@ struct ve_struct ve0 = {
 #endif
         .sched_lat_ve.cur = &ve0_lat_stats,
         .init_cred = &init_cred,
+        .mnt_nr = 0,
 };
 
 EXPORT_SYMBOL(ve0);
@@ -662,6 +663,7 @@ do_init:
         ve->aio_nr = 0;
         ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
 #endif
+        ve->mnt_nr = 0;
 
         return &ve->css;
diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
index 316e4d0..1a7e735 100644
--- a/kernel/ve/veowner.c
+++ b/kernel/ve/veowner.c
@@ -55,6 +55,14 @@ static void prepare_proc(void)
 int ve_xattr_policy = VE_XATTR_POLICY_ACCEPT;
 static int ve_area_access_check;
 
+/*
+ * Operations on a large number of mount points can take a long time.
+ * These operations take the global namespace_sem lock, so they can
+ * affect other containers as well. Allow a VE no more than
+ * sysctl_ve_mount_nr mount points.
+ */
+unsigned int sysctl_ve_mount_nr = 4096;
+
 static struct ctl_table vz_fs_table[] = {
         {
                 .procname = "ve-area-access-check",
@@ -77,6 +85,13 @@ static struct ctl_table vz_fs_table[] = {
                 .mode = 0644 | S_ISVTX,
                 .proc_handler = &proc_dointvec_virtual,
         },
+        {
+                .procname = "ve-mount-nr",
+                .data = &sysctl_ve_mount_nr,
+                .maxlen = sizeof(sysctl_ve_mount_nr),
+                .mode = 0644,
+                .proc_handler = proc_dointvec,
+        },
         { 0 }
 };
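
As a recap of the check added above: ve_mount_allowed() never
restricts VE0, while any other VE may allocate a new vfsmount only
while its mnt_nr is below the knob. A standalone sketch with the
kernel types stubbed out (struct ve below is a stand-in for
illustration, not the real ve_struct):

    /* Userspace model of the ve_mount_allowed() check; not kernel code. */
    #include <stdbool.h>
    #include <stdio.h>

    static unsigned int sysctl_ve_mount_nr = 4096; /* veowner.c default */

    struct ve {
            bool is_super;  /* true only for VE0, the host */
            int mnt_nr;     /* mounts currently charged to this VE */
    };

    static bool ve_mount_allowed(const struct ve *ve)
    {
            /* The host is never limited; a CT is capped by the knob. */
            return ve->is_super || ve->mnt_nr < (int)sysctl_ve_mount_nr;
    }

    int main(void)
    {
            struct ve host = { true, 100000 };
            struct ve ct = { false, 4096 };

            printf("host allowed: %d\n", ve_mount_allowed(&host)); /* 1 */
            printf("ct allowed:   %d\n", ve_mount_allowed(&ct));   /* 0 */
            return 0;
    }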