[Devel] [PATCH RHEL10 COMMIT] ve/mount: thread owning ve through alloc_vfsmnt/clone_mnt/copy_tree

Thu May 14 18:50:48 MSK 2026

The commit is pushed to "branch-rh10-6.12.0-55.52.1.5.x.vz10-ovz" and will appear at git@bitbucket.org:openvz/vzkernel.git
after rh10-6.12.0-55.52.1.5.24.vz10
------>
commit 74283849022503f0553f4330d5edb0005e889e6d
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Wed Apr 29 15:41:36 2026 +0200

    ve/mount: thread owning ve through alloc_vfsmnt/clone_mnt/copy_tree
    
    Add an owner_ve parameter to alloc_vfsmnt(), clone_mnt() and copy_tree()
    to identify the correct owning ve of a mount for accounting. Passing NULL
    preserves the existing behaviour of taking the current ve via
    get_exec_env().
    
    This will be used to derive the correct ownership of newly created mounts
    when a new ve namespace and a new mount namespace are created
    simultaneously, in which case the new mounts should be owned by the new ve.
    
    There is no change in behaviour yet: copy_mnt_ns() still resolves the
    owner from the current ve via get_exec_env().
    
    https://virtuozzo.atlassian.net/browse/VSTOR-129744
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    Reviewed-by: Vasileios Almpanis <vasileios.almpanis at virtuozzo.com>
    
    Feature: ve: ve generic structures
    ======
    Patchset description:
    ve: fix owner_ve of net/mnt namespaces created together with CLONE_NEWVE
    
    When CLONE_NEWVE is combined with CLONE_NEWNET and/or CLONE_NEWNS in a
    single clone3() or unshare(), copy_net_ns() and copy_mnt_ns() resolve
    the owning ve via get_exec_env(), which still points at the parent ve
    at that point. The freshly created net/mnt namespaces end up wired to
    the wrong ve, and unshare(CLONE_NEWVE | CLONE_NEW{NS,NET}) is rejected
    outright by check_unshare_flags().
    
    Fix it by threading the new ve from copy_namespaces() and
    unshare_nsproxy_namespaces() down into copy_net_ns() and copy_mnt_ns(),
    so the correct ve is charged for the new netns and for every mount in
    the new mntns.
    
    Patches 1-4 are pure plumbing (signature changes, no behaviour change).
    Patch 5 is the actual fix that forwards the new ve. Patch 6 drops the
    now-redundant CLONE_NEWVE-alone restriction in check_unshare_flags().
    Patch 7 exposes ve.mnt_nr via cgroupfs to make per-ve mount accounting
    observable from userspace. Patch 8 adds a selftest covering both the
    clone3() and unshare() paths.
    
    Verified with crash on a vzctl-started container: task_ve,
    nsproxy->net_ns->owner_ve, nsproxy->mnt_ns->ve_owner and
    nsproxy->mnt_ns->root.ve_owner all resolve to the new ve.
    The new selftest passes both cases.
---
 fs/namespace.c | 43 +++++++++++++++++++++++++------------------
 fs/pnode.c     |  2 +-
 fs/pnode.h     |  5 ++++-
 3 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index dd10ed5007ea2..ba2cee9a6db1c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -308,10 +308,10 @@ int mnt_get_count(struct mount *mnt)
 }
 
 static inline int ve_mount_allowed(void);
-static inline void ve_mount_nr_inc(struct mount *mnt);
+static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve);
 static inline void ve_mount_nr_dec(struct mount *mnt);
 
-static struct mount *alloc_vfsmnt(const char *name)
+static struct mount *alloc_vfsmnt(const char *name, struct ve_struct *owner_ve)
 {
 	struct mount *mnt;
 
@@ -360,7 +360,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_umounting);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
 		mnt->mnt.mnt_idmap = &nop_mnt_idmap;
-		ve_mount_nr_inc(mnt);
+		ve_mount_nr_inc(mnt, owner_ve);
 	}
 	return mnt;
 
@@ -1223,7 +1223,7 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
 	if (!fc->root)
 		return ERR_PTR(-EINVAL);
 
-	mnt = alloc_vfsmnt(fc->source ?: "none");
+	mnt = alloc_vfsmnt(fc->source ?: "none", NULL);
 	if (!mnt)
 		return ERR_PTR(-ENOMEM);
 
@@ -1325,13 +1325,13 @@ vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
 EXPORT_SYMBOL_GPL(vfs_submount);
 
 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
-					int flag)
+				int flag, struct ve_struct *owner_ve)
 {
 	struct super_block *sb = old->mnt.mnt_sb;
 	struct mount *mnt;
 	int err;
 
-	mnt = alloc_vfsmnt(old->mnt_devname);
+	mnt = alloc_vfsmnt(old->mnt_devname, owner_ve);
 	if (!mnt)
 		return ERR_PTR(-ENOMEM);
 
@@ -1565,7 +1565,7 @@ EXPORT_SYMBOL(path_is_mountpoint);
 struct vfsmount *mnt_clone_internal(const struct path *path)
 {
 	struct mount *p;
-	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
+	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE, NULL);
 	if (IS_ERR(p))
 		return ERR_CAST(p);
 	p->mnt.mnt_flags |= MNT_INTERNAL;
@@ -2153,7 +2153,7 @@ static bool mnt_ns_loop(struct dentry *dentry)
 }
 
 struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
-					int flag)
+			int flag, struct ve_struct *owner_ve)
 {
 	struct mount *res, *src_parent, *src_root_child, *src_mnt,
 		*dst_parent, *dst_mnt;
@@ -2164,7 +2164,7 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
 	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
 		return ERR_PTR(-EINVAL);
 
-	res = dst_mnt = clone_mnt(src_root, dentry, flag);
+	res = dst_mnt = clone_mnt(src_root, dentry, flag, owner_ve);
 	if (IS_ERR(dst_mnt))
 		return dst_mnt;
 
@@ -2200,7 +2200,8 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
 
 			src_parent = src_mnt;
 			dst_parent = dst_mnt;
-			dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag);
+			dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag,
+					    owner_ve);
 			if (IS_ERR(dst_mnt))
 				goto out;
 			lock_mount_hash();
@@ -2230,7 +2231,7 @@ struct vfsmount *collect_mounts(const struct path *path)
 		tree = ERR_PTR(-EINVAL);
 	else
 		tree = copy_tree(real_mount(path->mnt), path->dentry,
-				 CL_COPY_ALL | CL_PRIVATE);
+				 CL_COPY_ALL | CL_PRIVATE, NULL);
 	namespace_unlock();
 	if (IS_ERR(tree))
 		return ERR_CAST(tree);
@@ -2311,7 +2312,7 @@ struct vfsmount *clone_private_mount(const struct path *path)
 	if (has_locked_children(old_mnt, path->dentry))
 		goto invalid;
 
-	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
+	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE, NULL);
 	up_read(&namespace_sem);
 
 	if (IS_ERR(new_mnt))
@@ -2803,9 +2804,10 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
 		return mnt;
 
 	if (recurse)
-		mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
+		mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE,
+				NULL);
 	else
-		mnt = clone_mnt(old, old_path->dentry, 0);
+		mnt = clone_mnt(old, old_path->dentry, 0, NULL);
 
 	if (!IS_ERR(mnt))
 		mnt->mnt.mnt_flags &= ~MNT_LOCKED;
@@ -3221,9 +3223,10 @@ static inline int ve_mount_allowed(void)
 		atomic_read(&ve->mnt_nr) < (int)sysctl_ve_mount_nr;
 }
 
-static inline void ve_mount_nr_inc(struct mount *mnt)
+static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve)
 {
-	struct ve_struct *ve = get_exec_env();
+	if (!ve)
+		ve = get_exec_env();
 
 	mnt->ve_owner = get_ve(ve);
 	atomic_inc(&ve->mnt_nr);
@@ -3258,7 +3261,7 @@ bool is_sb_ve_accessible(struct ve_struct *ve, struct super_block *sb)
 #else /* CONFIG_VE */
 
 static inline int ve_mount_allowed(void) { return 1; }
-static inline void ve_mount_nr_inc(struct mount *mnt) { }
+static inline void ve_mount_nr_inc(struct mount *mnt, struct ve_struct *ve) { }
 static inline void ve_mount_nr_dec(struct mount *mnt) { }
 #endif /* CONFIG_VE */
 
@@ -4235,7 +4238,11 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
 	if (user_ns != ns->user_ns)
 		copy_flags |= CL_SHARED_TO_SLAVE;
-	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
+#ifdef CONFIG_VE
+	new = copy_tree(old, old->mnt.mnt_root, copy_flags, new_ns->ve_owner);
+#else
+	new = copy_tree(old, old->mnt.mnt_root, copy_flags, NULL);
+#endif
 	if (IS_ERR(new)) {
 		namespace_unlock();
 		ns_free_inum(&new_ns->ns);
diff --git a/fs/pnode.c b/fs/pnode.c
index a799e0315cc9a..a91e7aac1601d 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -257,7 +257,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 			type |= CL_MAKE_SHARED;
 	}
 		
-	child = copy_tree(last_source, last_source->mnt.mnt_root, type);
+	child = copy_tree(last_source, last_source->mnt.mnt_root, type, NULL);
 	if (IS_ERR(child))
 		return PTR_ERR(child);
 	read_seqlock_excl(&mount_lock);
diff --git a/fs/pnode.h b/fs/pnode.h
index 0b02a63938911..c865dd91f0a37 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -10,6 +10,8 @@
 #include <linux/list.h>
 #include "mount.h"
 
+struct ve_struct;
+
 #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
 #define IS_MNT_SLAVE(m) ((m)->mnt_master)
 #define IS_MNT_NEW(m)  (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
@@ -49,7 +51,8 @@ void mnt_set_mountpoint(struct mount *, struct mountpoint *,
 			struct mount *);
 void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
 			   struct mount *mnt);
-struct mount *copy_tree(struct mount *, struct dentry *, int);
+struct mount *copy_tree(struct mount *, struct dentry *, int,
+			struct ve_struct *);
 bool is_path_reachable(struct mount *, struct dentry *,
 			 const struct path *root);
 int count_mounts(struct mnt_namespace *ns, struct mount *mnt);