[Devel] [PATCH VZ10 3/6] ve/fs: Rework per-ve mount count

Tue Jun 30 20:13:29 MSK 2026

Previous approach with current mounts counter had an issue:
there was a gap between ve_mount_allowed check and ve_mount_nr_inc,
which could allow CT to have more mounts than expected.

Fix this by tracking the number of available mounts instead
of current ones. This also makes resources accounting
more consistent - we are using ***_avail_nr approach more.

One more issue with inconsistent ve value is fixed:
ve_mount_allowed always used ve from get_exec_env, but
ve_mount_nr_inc operated with owner_ve.
Now actual ve value is calculated in the beginning of alloc_vfsmnt.

ve_ns_owner_test is updated as well.

https://virtuozzo.atlassian.net/browse/VSTOR-135520

Feature: per-ve failcounters
Signed-off-by: Vladimir Riabchun <vladimir.riabchun at virtuozzo.com>
---
 fs/namespace.c                                | 52 ++++++++++---------
 include/linux/ve.h                            |  2 +-
 kernel/ve/ve.c                                | 12 ++---
 tools/testing/selftests/ve/ve_ns_owner_test.c | 29 ++++++-----
 4 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index d2fa363b9a56..32452898aadb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -317,15 +317,18 @@ int mnt_get_count(struct mount *mnt)
 #endif
 }
 
-static inline int ve_mount_allowed(void);
-static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve);
-static inline void ve_mount_nr_dec(struct mount *mnt);
+static inline int ve_try_reserve_mount(struct ve_struct *ve);
+static inline void ve_mount_put(struct mount *mnt);
 
 static struct mount *alloc_vfsmnt(const char *name, struct ve_struct *owner_ve)
 {
 	struct mount *mnt;
+	struct ve_struct *ve = owner_ve;
 
-	if (!ve_mount_allowed()) {
+	if (!ve)
+		ve = get_exec_env();
+
+	if (!ve_try_reserve_mount(ve)) {
 		pr_warn_ratelimited(
 			"CT#%s reached the limit on mounts.\n",
 			ve_name(get_exec_env()));
@@ -370,7 +373,10 @@ static struct mount *alloc_vfsmnt(const char *name, struct ve_struct *owner_ve)
 		INIT_LIST_HEAD(&mnt->mnt_umounting);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
 		mnt->mnt.mnt_idmap = &nop_mnt_idmap;
-		ve_mount_nr_inc(mnt, owner_ve);
+		/* Got ve reference in ve_try_reserve_mount */
+		mnt->ve_owner = ve;
+	} else {
+		goto out_nomem;
 	}
 	return mnt;
 
@@ -382,6 +388,10 @@ static struct mount *alloc_vfsmnt(const char *name, struct ve_struct *owner_ve)
 	mnt_free_id(mnt);
 out_free_cache:
 	kmem_cache_free(mnt_cache, mnt);
+out_nomem:
+	/* Got ve reference in ve_try_reserve_mount */
+	atomic_inc(&ve->mnt_avail_nr);
+	put_ve(ve);
 	return NULL;
 }
 
@@ -750,7 +760,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 static void free_vfsmnt(struct mount *mnt)
 {
 	mnt_idmap_put(mnt_idmap(&mnt->mnt));
-	ve_mount_nr_dec(mnt);
+	ve_mount_put(mnt);
 	kfree_const(mnt->mnt_devname);
 #ifdef CONFIG_SMP
 	free_percpu(mnt->mnt_pcp);
@@ -3258,28 +3268,21 @@ int ve_devmnt_process(struct ve_struct *ve, dev_t dev, void **data_pp, int remou
 	return err;
 }
 
-static inline int ve_mount_allowed(void)
+static inline int ve_try_reserve_mount(struct ve_struct *ve)
 {
-	struct ve_struct *ve = get_exec_env();
-
-	return ve_is_super(ve) || ve->is_pseudosuper ||
-		atomic_read(&ve->mnt_nr) < (int)sysctl_ve_mount_nr;
-}
-
-static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve)
-{
-	if (!ve)
-		ve = get_exec_env();
-
-	mnt->ve_owner = get_ve(ve);
-	atomic_inc(&ve->mnt_nr);
+	int ret = ve_is_super(ve) || ve->is_pseudosuper ||
+		  atomic_dec_if_positive(&ve->mnt_avail_nr) >= 0;
+	if (ret)
+		get_ve(ve);
+	return ret;
 }
 
-static inline void ve_mount_nr_dec(struct mount *mnt)
+static inline void ve_mount_put(struct mount *mnt)
 {
 	struct ve_struct *ve = mnt->ve_owner;
 
-	atomic_dec(&ve->mnt_nr);
+	if (!ve_is_super(ve) && !ve->is_pseudosuper)
+		atomic_inc(&ve->mnt_avail_nr);
 	put_ve(ve);
 	mnt->ve_owner = NULL;
 }
@@ -3303,9 +3306,8 @@ bool is_sb_ve_accessible(struct ve_struct *ve, struct super_block *sb)
 
 #else /* CONFIG_VE */
 
-static inline int ve_mount_allowed(void) { return 1; }
-static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve) { }
-static inline void ve_mount_nr_dec(struct mount *mnt) { }
+static inline int ve_try_reserve_mount(struct ve_struct *ve) { return 1; }
+static inline void ve_mount_put(struct mount *mnt) { }
 #endif /* CONFIG_VE */
 
 static int ve_prepare_mount_options(struct fs_context *fc, void *data)
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 5bbdbd3d5aa6..b2b3d4c12e39 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -87,7 +87,7 @@ struct ve_struct {
 	atomic_t		nd_neigh_nr;
 	unsigned long		meminfo_val;
 
-	atomic_t		mnt_nr; /* number of present VE mounts */
+	atomic_t		mnt_avail_nr; /* number of present VE mounts */
 
 #ifdef CONFIG_COREDUMP
 	char			core_pattern[CORENAME_MAX_SIZE];
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 56a869cec237..0fb5a2c1f63a 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -78,7 +78,7 @@ struct ve_struct ve0 = {
 
 	.arp_neigh_nr		= ATOMIC_INIT(0),
 	.nd_neigh_nr		= ATOMIC_INIT(0),
-	.mnt_nr			= ATOMIC_INIT(0),
+	.mnt_avail_nr		= ATOMIC_INIT(INT_MAX),
 	.meminfo_val		= VE_MEMINFO_SYSTEM,
 	.umh_running_helpers	= ATOMIC_INIT(0),
 	.umh_helpers_waitq	= __WAIT_QUEUE_HEAD_INITIALIZER(ve0.umh_helpers_waitq),
@@ -774,7 +774,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 
 	atomic_set(&ve->arp_neigh_nr, 0);
 	atomic_set(&ve->nd_neigh_nr, 0);
-	atomic_set(&ve->mnt_nr, 0);
+	atomic_set(&ve->mnt_avail_nr, sysctl_ve_mount_nr);
 
 #ifdef CONFIG_COREDUMP
 	strcpy(ve->core_pattern, "core");
@@ -1048,9 +1048,9 @@ static u64 ve_netns_avail_nr_read(struct cgroup_subsys_state *css, struct cftype
 	return atomic_read(&css_to_ve(css)->netns_avail_nr);
 }
 
-static u64 ve_mnt_nr_read(struct cgroup_subsys_state *css, struct cftype *cft)
+static s64 ve_mnt_avail_nr_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
-	return atomic_read(&css_to_ve(css)->mnt_nr);
+	return atomic_read(&css_to_ve(css)->mnt_avail_nr);
 }
 
 static u64 ve_netif_max_nr_read(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -1610,8 +1610,8 @@ static struct cftype ve_cftypes[] = {
 		.read_u64		= ve_netns_avail_nr_read,
 	},
 	{
-		.name			= "mnt_nr",
-		.read_u64		= ve_mnt_nr_read,
+		.name			= "mnt_avail_nr",
+		.read_s64		= ve_mnt_avail_nr_read,
 	},
 	{
 		.name			= "netif_max_nr",
diff --git a/tools/testing/selftests/ve/ve_ns_owner_test.c b/tools/testing/selftests/ve/ve_ns_owner_test.c
index b5b4f10be72b..3b5979fa63f6 100644
--- a/tools/testing/selftests/ve/ve_ns_owner_test.c
+++ b/tools/testing/selftests/ve/ve_ns_owner_test.c
@@ -18,7 +18,7 @@
  * with a new netns and mntns, the parent reads the new ve's counters
  * via cgroupfs and asserts they reflect the just-created namespaces:
  *   - ve.netns_avail_nr drops by exactly one (the new netns);
- *   - ve.mnt_nr is strictly greater than zero (mounts copied into the
+ *   - ve.mnt_avail_nr drops by exactly one (mounts copied into the
  *     new mntns are accounted to the new ve).
  *
  * We never assert against the parent ve's counters: those are shared
@@ -56,6 +56,7 @@
  * any spurious accounting against the parent ve would overflow it.
  */
 #define VE_NETNS_MAX		3
+#define VE_MOUNTS_MAX		4096
 
 static int write_file_at(int dirfd, const char *path, const char *val)
 {
@@ -187,7 +188,7 @@ static int clone_child_func(void *arg)
 /*
  * Before fix:
  *   - clone path: ve.netns_avail_nr stays at VE_NETNS_MAX and
- *     ve.mnt_nr stays at 0 because copy_net_ns()/copy_mnt_ns()
+ *     ve.mnt_avail_nr stays at VE_MOUNTS_MAX because copy_net_ns()/copy_mnt_ns()
  *     charged the parent ve via get_exec_env().
  *   - unshare path: the syscall itself returned -EINVAL, so this
  *     check was unreachable.
@@ -198,16 +199,16 @@ static int clone_child_func(void *arg)
 static void check_new_ve_owner(struct __test_metadata *_metadata,
 			       int cgv2_fd, int ctid)
 {
-	unsigned long long avail, mnt;
+	unsigned long long avail_netns, avail_mnt;
 	char path[64];
 
 	snprintf(path, sizeof(path), "%d/ve.netns_avail_nr", ctid);
-	ASSERT_EQ(read_u64_at(cgv2_fd, path, &avail), 0);
-	EXPECT_EQ(avail, VE_NETNS_MAX - 1);
+	ASSERT_EQ(read_u64_at(cgv2_fd, path, &avail_netns), 0);
+	EXPECT_EQ(avail_netns, VE_NETNS_MAX - 1);
 
-	snprintf(path, sizeof(path), "%d/ve.mnt_nr", ctid);
-	ASSERT_EQ(read_u64_at(cgv2_fd, path, &mnt), 0);
-	EXPECT_GT(mnt, 0);
+	snprintf(path, sizeof(path), "%d/ve.mnt_avail_nr", ctid);
+	ASSERT_EQ(read_u64_at(cgv2_fd, path, &avail_mnt), 0);
+	EXPECT_LT(avail_mnt, VE_MOUNTS_MAX);
 }
 
 FIXTURE(ve_ns_owner)
@@ -218,7 +219,7 @@ FIXTURE(ve_ns_owner)
 
 FIXTURE_SETUP(ve_ns_owner)
 {
-	unsigned long long initial_mnt_nr;
+	unsigned long long initial_mnt_avail_nr;
 	char ctid_str[16];
 	char val[16];
 	char path[64];
@@ -268,13 +269,13 @@ FIXTURE_SETUP(ve_ns_owner)
 
 	/*
 	 * The new ve cgroup has not been entered by anything yet, so its
-	 * mnt_nr counter must start at 0. Each test below verifies that
+	 * mnt_avail_nr counter be VE_MOUNTS_MAX. Each test below verifies that
 	 * the clone/unshare populates the new mntns under this ve, i.e.
-	 * mnt_nr rises strictly above zero.
+	 * mnt_avail_nr strictly decreases.
 	 */
-	snprintf(path, sizeof(path), "%d/ve.mnt_nr", self->ctid);
-	ASSERT_EQ(read_u64_at(self->cgv2_fd, path, &initial_mnt_nr), 0);
-	ASSERT_EQ(initial_mnt_nr, 0);
+	snprintf(path, sizeof(path), "%d/ve.mnt_avail_nr", self->ctid);
+	ASSERT_EQ(read_u64_at(self->cgv2_fd, path, &initial_mnt_avail_nr), 0);
+	ASSERT_EQ(initial_mnt_avail_nr, VE_MOUNTS_MAX);
 };
 
 FIXTURE_TEARDOWN(ve_ns_owner)
-- 
2.47.1