[Devel] [PATCH VZ10 3/6] ve/fs: Rework per-ve mount count
Vasileios Almpanis
vasileios.almpanis at virtuozzo.com
Thu Jul 2 13:48:13 MSK 2026
On 6/30/26 7:13 PM, Vladimir Riabchun wrote:
> Previous approach with current mounts counter had an issue:
> there was a gap between ve_mount_allowed check and ve_mount_nr_inc,
> which could allow CT to have more mounts than expected.
>
> Fix this by tracking the number of available mounts instead
> of current ones. This also makes resources accounting
> more consistent - we are using ***_avail_nr approach more.
>
> One more issue with inconsistent ve value is fixed:
> ve_mount_allowed always used ve from get_exec_env, but
> ve_mount_nr_inc operated with owner_ve.
> Now actual ve value is calculated in the beginning of alloc_vfsmnt.
>
> ve_ns_owner_test is updated as well.
>
> https://virtuozzo.atlassian.net/browse/VSTOR-135520
>
> Feature: per-ve failcounters
> Signed-off-by: Vladimir Riabchun <vladimir.riabchun at virtuozzo.com>
> ---
> fs/namespace.c | 52 ++++++++++---------
> include/linux/ve.h | 2 +-
> kernel/ve/ve.c | 12 ++---
> tools/testing/selftests/ve/ve_ns_owner_test.c | 29 ++++++-----
> 4 files changed, 49 insertions(+), 46 deletions(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d2fa363b9a56..32452898aadb 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -317,15 +317,18 @@ int mnt_get_count(struct mount *mnt)
> #endif
> }
>
> -static inline int ve_mount_allowed(void);
> -static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve);
> -static inline void ve_mount_nr_dec(struct mount *mnt);
> +static inline int ve_try_reserve_mount(struct ve_struct *ve);
> +static inline void ve_mount_put(struct mount *mnt);
>
> static struct mount *alloc_vfsmnt(const char *name, struct ve_struct *owner_ve)
> {
> struct mount *mnt;
> + struct ve_struct *ve = owner_ve;
>
> - if (!ve_mount_allowed()) {
> + if (!ve)
> + ve = get_exec_env();
> +
> + if (!ve_try_reserve_mount(ve)) {
> pr_warn_ratelimited(
> "CT#%s reached the limit on mounts.\n",
> ve_name(get_exec_env()));
> @@ -370,7 +373,10 @@ static struct mount *alloc_vfsmnt(const char *name, struct ve_struct *owner_ve)
> INIT_LIST_HEAD(&mnt->mnt_umounting);
> INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
> mnt->mnt.mnt_idmap = &nop_mnt_idmap;
> - ve_mount_nr_inc(mnt, owner_ve);
> + /* Got ve reference in ve_try_reserve_mount */
> + mnt->ve_owner = ve;
> + } else {
> + goto out_nomem;
> }
> return mnt;
>
> @@ -382,6 +388,10 @@ static struct mount *alloc_vfsmnt(const char *name, struct ve_struct *owner_ve)
> mnt_free_id(mnt);
> out_free_cache:
> kmem_cache_free(mnt_cache, mnt);
> +out_nomem:
> + /* Got ve reference in ve_try_reserve_mount */
> + atomic_inc(&ve->mnt_avail_nr);
Here you increment unconditionally but in ve_try_reserve_mount you do it
conditionally.
> + put_ve(ve);
> return NULL;
> }
>
> @@ -750,7 +760,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
> static void free_vfsmnt(struct mount *mnt)
> {
> mnt_idmap_put(mnt_idmap(&mnt->mnt));
> - ve_mount_nr_dec(mnt);
> + ve_mount_put(mnt);
> kfree_const(mnt->mnt_devname);
> #ifdef CONFIG_SMP
> free_percpu(mnt->mnt_pcp);
> @@ -3258,28 +3268,21 @@ int ve_devmnt_process(struct ve_struct *ve, dev_t dev, void **data_pp, int remou
> return err;
> }
>
> -static inline int ve_mount_allowed(void)
> +static inline int ve_try_reserve_mount(struct ve_struct *ve)
> {
> - struct ve_struct *ve = get_exec_env();
> -
> - return ve_is_super(ve) || ve->is_pseudosuper ||
> - atomic_read(&ve->mnt_nr) < (int)sysctl_ve_mount_nr;
> -}
> -
> -static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve)
> -{
> - if (!ve)
> - ve = get_exec_env();
> -
> - mnt->ve_owner = get_ve(ve);
> - atomic_inc(&ve->mnt_nr);
Generic comment, previous code was incrementing and decrementing
unconditionally.
Now you do it conditionally and maybe criu could be running, set
pseudosuper to 1 during mount
and skew the counter. Its very theoretical scenario but we reserve an
available mount because of pseudosuper value
and it is restored when the mount is freed which in this moment
pseudosuper could be 1 and the counter could become skewed.
> + int ret = ve_is_super(ve) || ve->is_pseudosuper ||
> + atomic_dec_if_positive(&ve->mnt_avail_nr) >= 0;
> + if (ret)
> + get_ve(ve);
> + return ret;
> }
>
> -static inline void ve_mount_nr_dec(struct mount *mnt)
> +static inline void ve_mount_put(struct mount *mnt)
> {
> struct ve_struct *ve = mnt->ve_owner;
>
> - atomic_dec(&ve->mnt_nr);
> + if (!ve_is_super(ve) && !ve->is_pseudosuper)
> + atomic_inc(&ve->mnt_avail_nr);
> put_ve(ve);
> mnt->ve_owner = NULL;
> }
> @@ -3303,9 +3306,8 @@ bool is_sb_ve_accessible(struct ve_struct *ve, struct super_block *sb)
>
> #else /* CONFIG_VE */
>
> -static inline int ve_mount_allowed(void) { return 1; }
> -static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve) { }
> -static inline void ve_mount_nr_dec(struct mount *mnt) { }
> +static inline int ve_try_reserve_mount(struct ve_struct *ve) { return 1; }
> +static inline void ve_mount_put(struct mount *mnt) { }
> #endif /* CONFIG_VE */
>
> static int ve_prepare_mount_options(struct fs_context *fc, void *data)
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 5bbdbd3d5aa6..b2b3d4c12e39 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -87,7 +87,7 @@ struct ve_struct {
> atomic_t nd_neigh_nr;
> unsigned long meminfo_val;
>
> - atomic_t mnt_nr; /* number of present VE mounts */
> + atomic_t mnt_avail_nr; /* number of present VE mounts */
>
> #ifdef CONFIG_COREDUMP
> char core_pattern[CORENAME_MAX_SIZE];
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 56a869cec237..0fb5a2c1f63a 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -78,7 +78,7 @@ struct ve_struct ve0 = {
>
> .arp_neigh_nr = ATOMIC_INIT(0),
> .nd_neigh_nr = ATOMIC_INIT(0),
> - .mnt_nr = ATOMIC_INIT(0),
> + .mnt_avail_nr = ATOMIC_INIT(INT_MAX),
> .meminfo_val = VE_MEMINFO_SYSTEM,
> .umh_running_helpers = ATOMIC_INIT(0),
> .umh_helpers_waitq = __WAIT_QUEUE_HEAD_INITIALIZER(ve0.umh_helpers_waitq),
> @@ -774,7 +774,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
>
> atomic_set(&ve->arp_neigh_nr, 0);
> atomic_set(&ve->nd_neigh_nr, 0);
> - atomic_set(&ve->mnt_nr, 0);
> + atomic_set(&ve->mnt_avail_nr, sysctl_ve_mount_nr);
>
> #ifdef CONFIG_COREDUMP
> strcpy(ve->core_pattern, "core");
> @@ -1048,9 +1048,9 @@ static u64 ve_netns_avail_nr_read(struct cgroup_subsys_state *css, struct cftype
> return atomic_read(&css_to_ve(css)->netns_avail_nr);
> }
>
> -static u64 ve_mnt_nr_read(struct cgroup_subsys_state *css, struct cftype *cft)
> +static s64 ve_mnt_avail_nr_read(struct cgroup_subsys_state *css, struct cftype *cft)
> {
> - return atomic_read(&css_to_ve(css)->mnt_nr);
> + return atomic_read(&css_to_ve(css)->mnt_avail_nr);
> }
>
> static u64 ve_netif_max_nr_read(struct cgroup_subsys_state *css, struct cftype *cft)
> @@ -1610,8 +1610,8 @@ static struct cftype ve_cftypes[] = {
> .read_u64 = ve_netns_avail_nr_read,
> },
> {
> - .name = "mnt_nr",
> - .read_u64 = ve_mnt_nr_read,
> + .name = "mnt_avail_nr",
> + .read_s64 = ve_mnt_avail_nr_read,
> },
> {
> .name = "netif_max_nr",
> diff --git a/tools/testing/selftests/ve/ve_ns_owner_test.c b/tools/testing/selftests/ve/ve_ns_owner_test.c
> index b5b4f10be72b..3b5979fa63f6 100644
> --- a/tools/testing/selftests/ve/ve_ns_owner_test.c
> +++ b/tools/testing/selftests/ve/ve_ns_owner_test.c
> @@ -18,7 +18,7 @@
> * with a new netns and mntns, the parent reads the new ve's counters
> * via cgroupfs and asserts they reflect the just-created namespaces:
> * - ve.netns_avail_nr drops by exactly one (the new netns);
> - * - ve.mnt_nr is strictly greater than zero (mounts copied into the
> + * - ve.mnt_avail_nr drops by exactly one (mounts copied into the
> * new mntns are accounted to the new ve).
> *
> * We never assert against the parent ve's counters: those are shared
> @@ -56,6 +56,7 @@
> * any spurious accounting against the parent ve would overflow it.
> */
> #define VE_NETNS_MAX 3
> +#define VE_MOUNTS_MAX 4096
>
> static int write_file_at(int dirfd, const char *path, const char *val)
> {
> @@ -187,7 +188,7 @@ static int clone_child_func(void *arg)
> /*
> * Before fix:
> * - clone path: ve.netns_avail_nr stays at VE_NETNS_MAX and
> - * ve.mnt_nr stays at 0 because copy_net_ns()/copy_mnt_ns()
> + * ve.mnt_avail_nr stays at VE_MOUNTS_MAX because copy_net_ns()/copy_mnt_ns()
> * charged the parent ve via get_exec_env().
> * - unshare path: the syscall itself returned -EINVAL, so this
> * check was unreachable.
> @@ -198,16 +199,16 @@ static int clone_child_func(void *arg)
> static void check_new_ve_owner(struct __test_metadata *_metadata,
> int cgv2_fd, int ctid)
> {
> - unsigned long long avail, mnt;
> + unsigned long long avail_netns, avail_mnt;
> char path[64];
>
> snprintf(path, sizeof(path), "%d/ve.netns_avail_nr", ctid);
> - ASSERT_EQ(read_u64_at(cgv2_fd, path, &avail), 0);
> - EXPECT_EQ(avail, VE_NETNS_MAX - 1);
> + ASSERT_EQ(read_u64_at(cgv2_fd, path, &avail_netns), 0);
> + EXPECT_EQ(avail_netns, VE_NETNS_MAX - 1);
>
> - snprintf(path, sizeof(path), "%d/ve.mnt_nr", ctid);
> - ASSERT_EQ(read_u64_at(cgv2_fd, path, &mnt), 0);
> - EXPECT_GT(mnt, 0);
> + snprintf(path, sizeof(path), "%d/ve.mnt_avail_nr", ctid);
> + ASSERT_EQ(read_u64_at(cgv2_fd, path, &avail_mnt), 0);
> + EXPECT_LT(avail_mnt, VE_MOUNTS_MAX);
> }
>
> FIXTURE(ve_ns_owner)
> @@ -218,7 +219,7 @@ FIXTURE(ve_ns_owner)
>
> FIXTURE_SETUP(ve_ns_owner)
> {
> - unsigned long long initial_mnt_nr;
> + unsigned long long initial_mnt_avail_nr;
> char ctid_str[16];
> char val[16];
> char path[64];
> @@ -268,13 +269,13 @@ FIXTURE_SETUP(ve_ns_owner)
>
> /*
> * The new ve cgroup has not been entered by anything yet, so its
> - * mnt_nr counter must start at 0. Each test below verifies that
> + * mnt_avail_nr counter be VE_MOUNTS_MAX. Each test below verifies that
> * the clone/unshare populates the new mntns under this ve, i.e.
> - * mnt_nr rises strictly above zero.
> + * mnt_avail_nr strictly decreases.
> */
> - snprintf(path, sizeof(path), "%d/ve.mnt_nr", self->ctid);
> - ASSERT_EQ(read_u64_at(self->cgv2_fd, path, &initial_mnt_nr), 0);
> - ASSERT_EQ(initial_mnt_nr, 0);
> + snprintf(path, sizeof(path), "%d/ve.mnt_avail_nr", self->ctid);
> + ASSERT_EQ(read_u64_at(self->cgv2_fd, path, &initial_mnt_avail_nr), 0);
> + ASSERT_EQ(initial_mnt_avail_nr, VE_MOUNTS_MAX);
> };
>
> FIXTURE_TEARDOWN(ve_ns_owner)
--
Best regards, Vasileios Almpanis
Software Developer, Virtuozzo.
More information about the Devel
mailing list