[Devel] [PATCH VZ10 3/6] ve/fs: Rework per-ve mount count
Vladimir Riabchun
vladimir.riabchun at virtuozzo.com
Tue Jun 30 20:13:29 MSK 2026
Previous approach with current mounts counter had an issue:
there was a gap between ve_mount_allowed check and ve_mount_nr_inc,
which could allow CT to have more mounts than expected.
Fix this by tracking the number of available mounts instead
of current ones. This also makes resources accounting
more consistent - we are using ***_avail_nr approach more.
One more issue with inconsistent ve value is fixed:
ve_mount_allowed always used ve from get_exec_env, but
ve_mount_nr_inc operated with owner_ve.
Now actual ve value is calculated in the beginning of alloc_vfsmnt.
ve_ns_owner_test is updated as well.
https://virtuozzo.atlassian.net/browse/VSTOR-135520
Feature: per-ve failcounters
Signed-off-by: Vladimir Riabchun <vladimir.riabchun at virtuozzo.com>
---
fs/namespace.c | 52 ++++++++++---------
include/linux/ve.h | 2 +-
kernel/ve/ve.c | 12 ++---
tools/testing/selftests/ve/ve_ns_owner_test.c | 29 ++++++-----
4 files changed, 49 insertions(+), 46 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index d2fa363b9a56..32452898aadb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -317,15 +317,18 @@ int mnt_get_count(struct mount *mnt)
#endif
}
-static inline int ve_mount_allowed(void);
-static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve);
-static inline void ve_mount_nr_dec(struct mount *mnt);
+static inline int ve_try_reserve_mount(struct ve_struct *ve);
+static inline void ve_mount_put(struct mount *mnt);
static struct mount *alloc_vfsmnt(const char *name, struct ve_struct *owner_ve)
{
struct mount *mnt;
+ struct ve_struct *ve = owner_ve;
- if (!ve_mount_allowed()) {
+ if (!ve)
+ ve = get_exec_env();
+
+ if (!ve_try_reserve_mount(ve)) {
pr_warn_ratelimited(
"CT#%s reached the limit on mounts.\n",
ve_name(get_exec_env()));
@@ -370,7 +373,10 @@ static struct mount *alloc_vfsmnt(const char *name, struct ve_struct *owner_ve)
INIT_LIST_HEAD(&mnt->mnt_umounting);
INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
mnt->mnt.mnt_idmap = &nop_mnt_idmap;
- ve_mount_nr_inc(mnt, owner_ve);
+ /* Got ve reference in ve_try_reserve_mount */
+ mnt->ve_owner = ve;
+ } else {
+ goto out_nomem;
}
return mnt;
@@ -382,6 +388,10 @@ static struct mount *alloc_vfsmnt(const char *name, struct ve_struct *owner_ve)
mnt_free_id(mnt);
out_free_cache:
kmem_cache_free(mnt_cache, mnt);
+out_nomem:
+ /* Got ve reference in ve_try_reserve_mount */
+ atomic_inc(&ve->mnt_avail_nr);
+ put_ve(ve);
return NULL;
}
@@ -750,7 +760,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
mnt_idmap_put(mnt_idmap(&mnt->mnt));
- ve_mount_nr_dec(mnt);
+ ve_mount_put(mnt);
kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
@@ -3258,28 +3268,21 @@ int ve_devmnt_process(struct ve_struct *ve, dev_t dev, void **data_pp, int remou
return err;
}
-static inline int ve_mount_allowed(void)
+static inline int ve_try_reserve_mount(struct ve_struct *ve)
{
- struct ve_struct *ve = get_exec_env();
-
- return ve_is_super(ve) || ve->is_pseudosuper ||
- atomic_read(&ve->mnt_nr) < (int)sysctl_ve_mount_nr;
-}
-
-static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve)
-{
- if (!ve)
- ve = get_exec_env();
-
- mnt->ve_owner = get_ve(ve);
- atomic_inc(&ve->mnt_nr);
+ int ret = ve_is_super(ve) || ve->is_pseudosuper ||
+ atomic_dec_if_positive(&ve->mnt_avail_nr) >= 0;
+ if (ret)
+ get_ve(ve);
+ return ret;
}
-static inline void ve_mount_nr_dec(struct mount *mnt)
+static inline void ve_mount_put(struct mount *mnt)
{
struct ve_struct *ve = mnt->ve_owner;
- atomic_dec(&ve->mnt_nr);
+ if (!ve_is_super(ve) && !ve->is_pseudosuper)
+ atomic_inc(&ve->mnt_avail_nr);
put_ve(ve);
mnt->ve_owner = NULL;
}
@@ -3303,9 +3306,8 @@ bool is_sb_ve_accessible(struct ve_struct *ve, struct super_block *sb)
#else /* CONFIG_VE */
-static inline int ve_mount_allowed(void) { return 1; }
-static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve) { }
-static inline void ve_mount_nr_dec(struct mount *mnt) { }
+static inline int ve_try_reserve_mount(struct ve_struct *ve) { return 1; }
+static inline void ve_mount_put(struct mount *mnt) { }
#endif /* CONFIG_VE */
static int ve_prepare_mount_options(struct fs_context *fc, void *data)
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 5bbdbd3d5aa6..b2b3d4c12e39 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -87,7 +87,7 @@ struct ve_struct {
atomic_t nd_neigh_nr;
unsigned long meminfo_val;
- atomic_t mnt_nr; /* number of present VE mounts */
+ atomic_t mnt_avail_nr; /* number of present VE mounts */
#ifdef CONFIG_COREDUMP
char core_pattern[CORENAME_MAX_SIZE];
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 56a869cec237..0fb5a2c1f63a 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -78,7 +78,7 @@ struct ve_struct ve0 = {
.arp_neigh_nr = ATOMIC_INIT(0),
.nd_neigh_nr = ATOMIC_INIT(0),
- .mnt_nr = ATOMIC_INIT(0),
+ .mnt_avail_nr = ATOMIC_INIT(INT_MAX),
.meminfo_val = VE_MEMINFO_SYSTEM,
.umh_running_helpers = ATOMIC_INIT(0),
.umh_helpers_waitq = __WAIT_QUEUE_HEAD_INITIALIZER(ve0.umh_helpers_waitq),
@@ -774,7 +774,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
atomic_set(&ve->arp_neigh_nr, 0);
atomic_set(&ve->nd_neigh_nr, 0);
- atomic_set(&ve->mnt_nr, 0);
+ atomic_set(&ve->mnt_avail_nr, sysctl_ve_mount_nr);
#ifdef CONFIG_COREDUMP
strcpy(ve->core_pattern, "core");
@@ -1048,9 +1048,9 @@ static u64 ve_netns_avail_nr_read(struct cgroup_subsys_state *css, struct cftype
return atomic_read(&css_to_ve(css)->netns_avail_nr);
}
-static u64 ve_mnt_nr_read(struct cgroup_subsys_state *css, struct cftype *cft)
+static s64 ve_mnt_avail_nr_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return atomic_read(&css_to_ve(css)->mnt_nr);
+ return atomic_read(&css_to_ve(css)->mnt_avail_nr);
}
static u64 ve_netif_max_nr_read(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -1610,8 +1610,8 @@ static struct cftype ve_cftypes[] = {
.read_u64 = ve_netns_avail_nr_read,
},
{
- .name = "mnt_nr",
- .read_u64 = ve_mnt_nr_read,
+ .name = "mnt_avail_nr",
+ .read_s64 = ve_mnt_avail_nr_read,
},
{
.name = "netif_max_nr",
diff --git a/tools/testing/selftests/ve/ve_ns_owner_test.c b/tools/testing/selftests/ve/ve_ns_owner_test.c
index b5b4f10be72b..3b5979fa63f6 100644
--- a/tools/testing/selftests/ve/ve_ns_owner_test.c
+++ b/tools/testing/selftests/ve/ve_ns_owner_test.c
@@ -18,7 +18,7 @@
* with a new netns and mntns, the parent reads the new ve's counters
* via cgroupfs and asserts they reflect the just-created namespaces:
* - ve.netns_avail_nr drops by exactly one (the new netns);
- * - ve.mnt_nr is strictly greater than zero (mounts copied into the
+ * - ve.mnt_avail_nr drops by exactly one (mounts copied into the
* new mntns are accounted to the new ve).
*
* We never assert against the parent ve's counters: those are shared
@@ -56,6 +56,7 @@
* any spurious accounting against the parent ve would overflow it.
*/
#define VE_NETNS_MAX 3
+#define VE_MOUNTS_MAX 4096
static int write_file_at(int dirfd, const char *path, const char *val)
{
@@ -187,7 +188,7 @@ static int clone_child_func(void *arg)
/*
* Before fix:
* - clone path: ve.netns_avail_nr stays at VE_NETNS_MAX and
- * ve.mnt_nr stays at 0 because copy_net_ns()/copy_mnt_ns()
+ * ve.mnt_avail_nr stays at VE_MOUNTS_MAX because copy_net_ns()/copy_mnt_ns()
* charged the parent ve via get_exec_env().
* - unshare path: the syscall itself returned -EINVAL, so this
* check was unreachable.
@@ -198,16 +199,16 @@ static int clone_child_func(void *arg)
static void check_new_ve_owner(struct __test_metadata *_metadata,
int cgv2_fd, int ctid)
{
- unsigned long long avail, mnt;
+ unsigned long long avail_netns, avail_mnt;
char path[64];
snprintf(path, sizeof(path), "%d/ve.netns_avail_nr", ctid);
- ASSERT_EQ(read_u64_at(cgv2_fd, path, &avail), 0);
- EXPECT_EQ(avail, VE_NETNS_MAX - 1);
+ ASSERT_EQ(read_u64_at(cgv2_fd, path, &avail_netns), 0);
+ EXPECT_EQ(avail_netns, VE_NETNS_MAX - 1);
- snprintf(path, sizeof(path), "%d/ve.mnt_nr", ctid);
- ASSERT_EQ(read_u64_at(cgv2_fd, path, &mnt), 0);
- EXPECT_GT(mnt, 0);
+ snprintf(path, sizeof(path), "%d/ve.mnt_avail_nr", ctid);
+ ASSERT_EQ(read_u64_at(cgv2_fd, path, &avail_mnt), 0);
+ EXPECT_LT(avail_mnt, VE_MOUNTS_MAX);
}
FIXTURE(ve_ns_owner)
@@ -218,7 +219,7 @@ FIXTURE(ve_ns_owner)
FIXTURE_SETUP(ve_ns_owner)
{
- unsigned long long initial_mnt_nr;
+ unsigned long long initial_mnt_avail_nr;
char ctid_str[16];
char val[16];
char path[64];
@@ -268,13 +269,13 @@ FIXTURE_SETUP(ve_ns_owner)
/*
* The new ve cgroup has not been entered by anything yet, so its
- * mnt_nr counter must start at 0. Each test below verifies that
+ * mnt_avail_nr counter be VE_MOUNTS_MAX. Each test below verifies that
* the clone/unshare populates the new mntns under this ve, i.e.
- * mnt_nr rises strictly above zero.
+ * mnt_avail_nr strictly decreases.
*/
- snprintf(path, sizeof(path), "%d/ve.mnt_nr", self->ctid);
- ASSERT_EQ(read_u64_at(self->cgv2_fd, path, &initial_mnt_nr), 0);
- ASSERT_EQ(initial_mnt_nr, 0);
+ snprintf(path, sizeof(path), "%d/ve.mnt_avail_nr", self->ctid);
+ ASSERT_EQ(read_u64_at(self->cgv2_fd, path, &initial_mnt_avail_nr), 0);
+ ASSERT_EQ(initial_mnt_avail_nr, VE_MOUNTS_MAX);
};
FIXTURE_TEARDOWN(ve_ns_owner)
--
2.47.1
More information about the Devel
mailing list