[Devel] [PATCH 07/13] ve/fs: add per-VE limit of mount points
Alexander Mikhalitsyn
alexander.mikhalitsyn at virtuozzo.com
Wed Apr 14 10:57:54 MSK 2021
From: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
https://jira.sw.ru/browse/PSBM-34438
(This fix was adapted from PCS6.)
It is possible for a container to create lots of mount points, which may
make operations with them slower. As some of these operations take
global locks (namespace_sem, vfsmount_lock), it might affect other
containers as well.
Let us limit the maximum number of mount points a VE may create. The
limit can be customized via /proc/sys/fs/ve-mount-nr knob.
Changes in v.3:
* Revisited VE-specific parts of the patch to reduce the impact on the
generic code.
Changes in v.2:
* The situations where VE0 mounts something and another VE unmounts it
seem to be of no concern. If so, it is OK not to alter struct mount:
the mount counter for a VE may become unbalanced but this is
acceptable here.
* The sysctl knob is now defined alongside other VE sysctls.
Signed-off-by: Evgenii Shatokhin <eshatokhin at virtuozzo.com>
Acked-by: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
Testing results:
This "mount bomb" ends with "cannot allocate memory" and makes no harm:
https://github.com/avagin/ctb/blob/master/mounts/run.sh
=====================================
ve/fs: Print a message if a VE reaches its limit on mounts
This should help debugging the situations when alloc_vfsmnt() fails.
Rate-limiting is used to avoid the flood of such messages if the mount
limit is hit often.
v2: Print the name of the VE rather than its address.
Suggested in https://jira.sw.ru/browse/PSBM-42825
Signed-off-by: Evgenii Shatokhin <eshatokhin at virtuozzo.com>
=====================================
ve/fs: Use comparison of signed integers in ve_mount_allowed()
ve->mnr_nr is a signed integer which may become negative in some cases:
* if the increments and decrements of ve->mnr_nr race against each other
and the decrements win;
* if something is mounted by one VE but unmounted by another VE.
sysctl_ve_mount_nr is unsigned.
So the comparison (ve->mnr_nr < sysctl_ve_mount_nr) can actually be
compiled as ((unsigned int)ve->mnr_nr < sysctl_ve_mount_nr).
ve_mount_allowed() would return 0 in that case and the mount operation
would fail as a result.
This patch fixes the problem.
https://jira.sw.ru/browse/PSBM-42825
Signed-off-by: Evgenii Shatokhin <eshatokhin at virtuozzo.com>
==========================================
ve/fs: convert per-VE number of currently mounted fs to atomic_t
At the moment ve->mnt_nr is not atomic and not guarded by any lock =>
convert it to atomic_t in order to avoid races on updates
https://jira.sw.ru/browse/PSBM-69880
Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
==========================================
ve/fs: limit "fs.ve-mount-nr" sysctl with INT_MAX
sysctl "fs.ve-mount-nr" is unsigned int and is casted to "int" while
comparing values => if we set it to a value > INT_MAX, VE won't be able to
mount anything after that.
=> set a max value for sysctl == INT_MAX
https://jira.sw.ru/browse/PSBM-69880
Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
===========================================
ve/fs/mount: pin mnt to a VE for correct number of mounts accounting
If a mount is done in one VE context and umount - in another,
counters ve->mnt_nr became unbalanced and this can cause denial
of new mounts due to per-VE numbre of mounts limit.
Additionally move related functions to fs/namespace.c to avoid adding
fs/mount.h into ve.h:
ve_mount_nr_inc()
ve_mount_nr_dec()
ve_mount_allowed()
Example: mount is done from inside a CT, umount - from host =>
ve->mnt_nr gets incorrectly increased for a CT.
https://jira.sw.ru/browse/PSBM-69880
Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
===========================================
VZ 8 rebase part https://jira.sw.ru/browse/PSBM-127782
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
---
fs/mount.h | 3 +++
fs/namespace.c | 50 +++++++++++++++++++++++++++++++++++++++++++--
include/linux/ve.h | 5 +++++
kernel/ve/ve.c | 3 +++
kernel/ve/veowner.c | 21 ++++++++++++++++++-
5 files changed, 79 insertions(+), 3 deletions(-)
diff --git a/fs/mount.h b/fs/mount.h
index b077957509ca..e3e0379d5c47 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -60,6 +60,9 @@ struct mount {
struct mountpoint *mnt_mp; /* where is it mounted */
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
struct list_head mnt_umounting; /* list entry for umount propagation */
+#ifdef CONFIG_VE
+ struct ve_struct *ve_owner; /* VE in which this mount was created */
+#endif /* CONFIG_VE */
#ifdef CONFIG_FSNOTIFY
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
diff --git a/fs/namespace.c b/fs/namespace.c
index 07c63c485940..7866456cbad4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -177,9 +177,22 @@ static void drop_mountpoint(struct fs_pin *p)
mntput(&m->mnt);
}
+static inline int ve_mount_allowed(void);
+static inline void ve_mount_nr_inc(struct mount *mnt);
+static inline void ve_mount_nr_dec(struct mount *mnt);
+
static struct mount *alloc_vfsmnt(const char *name)
{
- struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
+ struct mount *mnt;
+
+ if (!ve_mount_allowed()) {
+ pr_warn_ratelimited(
+ "CT#%s reached the limit on mounts.\n",
+ ve_name(get_exec_env()));
+ return NULL;
+ }
+
+ mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
if (mnt) {
int err;
@@ -215,6 +228,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_HLIST_NODE(&mnt->mnt_mp_list);
INIT_LIST_HEAD(&mnt->mnt_umounting);
init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
+ ve_mount_nr_inc(mnt);
}
return mnt;
@@ -555,6 +569,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
+ ve_mount_nr_dec(mnt);
kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
@@ -2493,7 +2508,38 @@ int ve_devmnt_process(struct ve_struct *ve, dev_t dev, void **data_pp, int remou
return err;
}
-#endif
+
+static inline int ve_mount_allowed(void)
+{
+ struct ve_struct *ve = get_exec_env();
+
+ return ve_is_super(ve) ||
+ atomic_read(&ve->mnt_nr) < (int)sysctl_ve_mount_nr;
+}
+
+static inline void ve_mount_nr_inc(struct mount *mnt)
+{
+ struct ve_struct *ve = get_exec_env();
+
+ mnt->ve_owner = get_ve(ve);
+ atomic_inc(&ve->mnt_nr);
+}
+
+static inline void ve_mount_nr_dec(struct mount *mnt)
+{
+ struct ve_struct *ve = mnt->ve_owner;
+
+ atomic_dec(&ve->mnt_nr);
+ put_ve(ve);
+ mnt->ve_owner = NULL;
+}
+
+#else /* CONFIG_VE */
+
+static inline int ve_mount_allowed(void) { return 1; }
+static inline void ve_mount_nr_inc(struct mount *mnt) { }
+static inline void ve_mount_nr_dec(struct mount *mnt) { }
+#endif /* CONFIG_VE */
static int do_check_and_remount_sb(struct super_block *sb, int flags, void *data)
{
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 0f769a96e805..1276b0f98b2f 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -90,6 +90,9 @@ struct ve_struct {
struct task_struct *umh_task;
unsigned long meminfo_val;
+
+ atomic_t mnt_nr; /* number of present VE mounts */
+
#ifdef CONFIG_COREDUMP
char core_pattern[CORENAME_MAX_SIZE];
#endif
@@ -143,6 +146,8 @@ extern int nr_ve;
#define capable_setveid() \
(ve_is_super(get_exec_env()) && capable(CAP_SYS_ADMIN))
+extern unsigned int sysctl_ve_mount_nr;
+
#ifdef CONFIG_VE
void ve_add_to_release_list(struct cgroup *cgrp);
void ve_rm_from_release_list(struct cgroup *cgrp);
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index ecc02cd3c8f3..ef2b4c670587 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -72,6 +72,7 @@ struct ve_struct ve0 = {
#ifdef CONFIG_VE_IPTABLES
.ipt_mask = VE_IP_ALL, /* everything is allowed */
#endif
+ .mnt_nr = ATOMIC_INIT(0),
.netns_avail_nr = ATOMIC_INIT(INT_MAX),
.netns_max_nr = INT_MAX,
.meminfo_val = VE_MEMINFO_SYSTEM,
@@ -881,6 +882,8 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
INIT_LIST_HEAD(&ve->ve_list);
kmapset_init_key(&ve->sysfs_perms_key);
+ atomic_set(&ve->mnt_nr, 0);
+
#ifdef CONFIG_COREDUMP
strcpy(ve->core_pattern, "core");
#endif
diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
index 2dfa9920da40..926a479f1e2a 100644
--- a/kernel/ve/veowner.c
+++ b/kernel/ve/veowner.c
@@ -65,6 +65,16 @@ static void prepare_proc(void)
* ------------------------------------------------------------------------
*/
+/*
+ * Operations with a big amount of mount points can require a lot of time.
+ * These operations take the global lock namespace_sem, so they can affect
+ * other containers. Let us allow no more than sysctl_ve_mount_nr mount
+ * points for a VE.
+ */
+unsigned int sysctl_ve_mount_nr = 4096;
+static int ve_mount_nr_min = 0;
+static int ve_mount_nr_max = INT_MAX;
+
static struct ctl_table vz_fs_table[] = {
{
.procname = "fsync-enable",
@@ -73,7 +83,16 @@ static struct ctl_table vz_fs_table[] = {
.mode = 0644 | S_ISVTX,
.proc_handler = &proc_dointvec_virtual,
},
- { }
+ {
+ .procname = "ve-mount-nr",
+ .data = &sysctl_ve_mount_nr,
+ .maxlen = sizeof(sysctl_ve_mount_nr),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &ve_mount_nr_min,
+ .extra2 = &ve_mount_nr_max,
+ },
+ { 0 }
};
static struct ctl_path fs_path[] = {
--
2.28.0
More information about the Devel
mailing list