[Devel] [PATCH RHEL10 COMMIT] ve/nsproxy: forward new ve from copy_namespaces and ksys_unshare

Konstantin Khorenko khorenko at virtuozzo.com
Thu May 14 18:52:06 MSK 2026


The commit is pushed to "branch-rh10-6.12.0-55.52.1.5.x.vz10-ovz" and will appear at git@bitbucket.org:openvz/vzkernel.git
after rh10-6.12.0-55.52.1.5.24.vz10
------>
commit 540cbf41ae3b4506321d61378a72e239d332d192
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Wed Apr 29 15:41:39 2026 +0200

    ve/nsproxy: forward new ve from copy_namespaces and ksys_unshare
    
    This is the actual fix for the problem where combining CLONE_NEWVE with
    CLONE_NEWNET and/or CLONE_NEWNS in a single clone() or unshare() syscall
    left the new netns/mntns owned by the parent's ve instead of the freshly
    created one.
    
    The cause was that copy_net_ns() and copy_mnt_ns() resolved the owning
    ve via get_exec_env() at call time, while:
    
    - in copy_process(), copy_ve_ns() updates only @tsk (the child) but
      current is still the parent;
    - in ksys_unshare(), unshare_ve_namespace() builds a new ve_namespace
      in a local variable and only commits it to current after
      unshare_nsproxy_namespaces() has already run.
    
    So both call paths left a window in which get_exec_env() returned the
    wrong ve for the namespaces being created alongside the new ve.
    
    Now that copy_mnt_ns() and copy_net_ns() accept an explicit owning ve,
    forward it from:
    
    - copy_namespaces(): when CLONE_NEWVE is set, use task_ve(tsk), which
      copy_ve_ns() has just updated to point at the new ve;
    - unshare_nsproxy_namespaces(): a new struct ve_namespace * argument
      fed by ksys_unshare() with the new_ve_ns produced by the preceding
      unshare_ve_namespace() call.
    
    Other callers of create_new_namespaces() (exec_task_namespaces() and
    prepare_nsset() for setns) pass NULL: they never create a new ve.
    
    After this patch, a clone3()/unshare() call with CLONE_NEWVE |
    CLONE_NEWNET | CLONE_NEWNS attributes the new netns/mntns to the new ve,
    including ve.netns_avail_nr / ve.mnt_nr accounting.
    
    Note: The companion patch removes the now-unnecessary unshare-time guard
    in check_unshare_flags() that used to forbid this combination outright.
    
    https://virtuozzo.atlassian.net/browse/VSTOR-129744
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    Reviewed-by: Vasileios Almpanis <vasileios.almpanis at virtuozzo.com>
    
    Feature: ve: ve generic structures
    ======
    Patchset description:
    ve: fix owner_ve of net/mnt namespaces created together with CLONE_NEWVE
    
    When CLONE_NEWVE is combined with CLONE_NEWNET and/or CLONE_NEWNS in a
    single clone3() or unshare(), copy_net_ns() and copy_mnt_ns() resolve
    the owning ve via get_exec_env(), which still points at the parent ve
    at that point. The freshly created net/mnt namespaces end up wired to
    the wrong ve, and unshare(CLONE_NEWVE | CLONE_NEW{NS,NET}) is rejected
    outright by check_unshare_flags().
    
    Fix it by threading the new ve from copy_namespaces() and
    unshare_nsproxy_namespaces() down into copy_net_ns() and copy_mnt_ns(),
    so the correct ve is charged for the new netns and for every mount in
    the new mntns.
    
    Patches 1-4 are pure plumbing (signature changes, no behaviour change).
    Patch 5 is the actual fix that forwards the new ve. Patch 6 drops the
    now-redundant check_unshare_flags() restriction that CLONE_NEWVE be used alone.
    Patch 7 exposes ve.mnt_nr via cgroupfs to make per-ve mount accounting
    observable from userspace. Patch 8 adds a selftest covering both the
    clone3() and unshare() paths.
    
    Verified with the crash utility on a vzctl-started container: task_ve,
    nsproxy->net_ns->owner_ve, nsproxy->mnt_ns->ve_owner and
    nsproxy->mnt_ns->root.ve_owner all resolve to the new ve.
    The new selftest passes in both cases.
---
 include/linux/nsproxy.h |  3 ++-
 kernel/fork.c           |  2 +-
 kernel/nsproxy.c        | 45 +++++++++++++++++++++++++++++++++++++--------
 3 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 4dc0fe036bb9c..438eb291a7f93 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -110,8 +110,9 @@ void exit_task_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
 int exec_task_namespaces(void);
 void free_nsproxy(struct nsproxy *ns);
+struct ve_namespace;
 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
-	struct cred *, struct fs_struct *);
+	struct cred *, struct fs_struct *, struct ve_namespace *);
 int __init nsproxy_cache_init(void);
 
 static inline void put_nsproxy(struct nsproxy *ns)
diff --git a/kernel/fork.c b/kernel/fork.c
index 95d843b9dd1e1..a1b9fec275799 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3360,7 +3360,7 @@ int ksys_unshare(unsigned long unshare_flags)
 	if (err)
 		goto bad_unshare_cleanup_cred;
 	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
-					 new_cred, new_fs);
+					 new_cred, new_fs, new_ve_ns);
 	if (err)
 		goto bad_unshare_cleanup_ve_namespace;
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d0e8f9d98a37..069a103993d7c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -67,7 +67,7 @@ static inline struct nsproxy *create_nsproxy(void)
  */
 static struct nsproxy *create_new_namespaces(unsigned long flags,
 	struct task_struct *tsk, struct user_namespace *user_ns,
-	struct fs_struct *new_fs)
+	struct fs_struct *new_fs, struct ve_struct *new_ve)
 {
 	struct nsproxy *new_nsp;
 	int err;
@@ -77,7 +77,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		return ERR_PTR(-ENOMEM);
 
 	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns,
-				      new_fs, NULL);
+				      new_fs, new_ve);
 	if (IS_ERR(new_nsp->mnt_ns)) {
 		err = PTR_ERR(new_nsp->mnt_ns);
 		goto out_ns;
@@ -110,7 +110,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 	}
 
 	new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns,
-				      NULL);
+				      new_ve);
 	if (IS_ERR(new_nsp->net_ns)) {
 		err = PTR_ERR(new_nsp->net_ns);
 		goto out_net;
@@ -156,6 +156,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 	struct nsproxy *old_ns = tsk->nsproxy;
 	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
 	struct nsproxy *new_ns;
+	struct ve_struct *new_ve = NULL;
 
 	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
 			      CLONE_NEWPID | CLONE_NEWNET |
@@ -179,7 +180,20 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 		(CLONE_NEWIPC | CLONE_SYSVSEM))
 		return -EINVAL;
 
-	new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
+#ifdef CONFIG_VE
+	/*
+	 * If a new ve namespace was just created for tsk by copy_ve_ns()
+	 * (which runs immediately before us in copy_process), use that ve
+	 * as the owner for new mount/net namespaces created in the same
+	 * clone. Otherwise get_exec_env() in the callees would resolve to
+	 * the parent's ve and we'd end up with owner_ve/ve_owner pointing
+	 * at the wrong ve.
+	 */
+	if (flags & CLONE_NEWVE)
+		new_ve = task_ve(tsk);
+#endif
+
+	new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs, new_ve);
 	if (IS_ERR(new_ns))
 		return  PTR_ERR(new_ns);
 
@@ -214,9 +228,11 @@ void free_nsproxy(struct nsproxy *ns)
  * On success, returns the new nsproxy.
  */
 int unshare_nsproxy_namespaces(unsigned long unshare_flags,
-	struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
+	struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs,
+	struct ve_namespace *new_ve_ns)
 {
 	struct user_namespace *user_ns;
+	struct ve_struct *new_ve = NULL;
 	int err = 0;
 
 	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
@@ -228,8 +244,20 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
+#ifdef CONFIG_VE
+	/*
+	 * unshare_ve_namespace() ran before us and may have allocated a
+	 * new ve_namespace which is not yet installed on current. Pass
+	 * its ve down so that new mount/net namespaces created here are
+	 * owned by it instead of current's (about to be replaced) ve.
+	 */
+	if (new_ve_ns)
+		new_ve = new_ve_ns->ve;
+#endif
+
 	*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
-					 new_fs ? new_fs : current->fs);
+					 new_fs ? new_fs : current->fs,
+					 new_ve);
 	if (IS_ERR(*new_nsp)) {
 		err = PTR_ERR(*new_nsp);
 		goto out;
@@ -267,7 +295,7 @@ int exec_task_namespaces(void)
 	if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
 		return 0;
 
-	new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
+	new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs, NULL);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
@@ -341,7 +369,8 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
 {
 	struct task_struct *me = current;
 
-	nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
+	nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs,
+					       NULL);
 	if (IS_ERR(nsset->nsproxy))
 		return PTR_ERR(nsset->nsproxy);
 


More information about the Devel mailing list