[Devel] [PATCH VZ10 5/8] ve/nsproxy: forward new ve from copy_namespaces and ksys_unshare

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Wed Apr 29 16:41:39 MSK 2026


This is the actual fix for the problem where combining CLONE_NEWVE with
CLONE_NEWNET and/or CLONE_NEWNS in a single clone() or unshare() syscall
left the new netns/mntns owned by the parent's ve instead of the freshly
created one.

The cause was that copy_net_ns() and copy_mnt_ns() resolved the owning
ve via get_exec_env() at call time, while:

- in copy_process(), copy_ve_ns() updates only @tsk (the child) but
  current is still the parent;
- in ksys_unshare(), unshare_ve_namespace() builds a new ve_namespace
  in a local variable and only commits it to current after
  unshare_nsproxy_namespaces() has already run.

So both call paths had a window in which get_exec_env() still returned
the old ve, not the new ve being created alongside these namespaces.

Now that copy_mnt_ns() and copy_net_ns() accept an explicit owning ve,
forward it from:

- copy_namespaces(): when CLONE_NEWVE is set, use task_ve(tsk), which
  copy_ve_ns() has just updated to point at the new ve;
- unshare_nsproxy_namespaces(): a new struct ve_namespace * argument
  fed by ksys_unshare() with the new_ve_ns produced by the preceding
  unshare_ve_namespace() call.

Other callers of create_new_namespaces() (exec_task_namespaces() and
prepare_nsset() for setns) pass NULL: they never create a new ve.

After this patch, a clone3()/unshare() call with CLONE_NEWVE |
CLONE_NEWNET | CLONE_NEWNS assigns ownership of the new netns/mntns to
the new ve, including ve.netns_avail_nr / ve.mnt_nr accounting.

Note: The companion patch removes the now-unnecessary unshare-time guard
in check_unshare_flags() that used to forbid this combination outright.

https://virtuozzo.atlassian.net/browse/VSTOR-129744
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Feature: ve: ve generic structures
---
 include/linux/nsproxy.h |  3 ++-
 kernel/fork.c           |  2 +-
 kernel/nsproxy.c        | 45 +++++++++++++++++++++++++++++++++--------
 3 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 4dc0fe036bb9..438eb291a7f9 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -110,8 +110,9 @@ void exit_task_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
 int exec_task_namespaces(void);
 void free_nsproxy(struct nsproxy *ns);
+struct ve_namespace;
 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
-	struct cred *, struct fs_struct *);
+	struct cred *, struct fs_struct *, struct ve_namespace *);
 int __init nsproxy_cache_init(void);
 
 static inline void put_nsproxy(struct nsproxy *ns)
diff --git a/kernel/fork.c b/kernel/fork.c
index 95d843b9dd1e..a1b9fec27579 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3360,7 +3360,7 @@ int ksys_unshare(unsigned long unshare_flags)
 	if (err)
 		goto bad_unshare_cleanup_cred;
 	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
-					 new_cred, new_fs);
+					 new_cred, new_fs, new_ve_ns);
 	if (err)
 		goto bad_unshare_cleanup_ve_namespace;
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d0e8f9d98a3..069a103993d7 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -67,7 +67,7 @@ static inline struct nsproxy *create_nsproxy(void)
  */
 static struct nsproxy *create_new_namespaces(unsigned long flags,
 	struct task_struct *tsk, struct user_namespace *user_ns,
-	struct fs_struct *new_fs)
+	struct fs_struct *new_fs, struct ve_struct *new_ve)
 {
 	struct nsproxy *new_nsp;
 	int err;
@@ -77,7 +77,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		return ERR_PTR(-ENOMEM);
 
 	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns,
-				      new_fs, NULL);
+				      new_fs, new_ve);
 	if (IS_ERR(new_nsp->mnt_ns)) {
 		err = PTR_ERR(new_nsp->mnt_ns);
 		goto out_ns;
@@ -110,7 +110,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 	}
 
 	new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns,
-				      NULL);
+				      new_ve);
 	if (IS_ERR(new_nsp->net_ns)) {
 		err = PTR_ERR(new_nsp->net_ns);
 		goto out_net;
@@ -156,6 +156,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 	struct nsproxy *old_ns = tsk->nsproxy;
 	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
 	struct nsproxy *new_ns;
+	struct ve_struct *new_ve = NULL;
 
 	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
 			      CLONE_NEWPID | CLONE_NEWNET |
@@ -179,7 +180,20 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 		(CLONE_NEWIPC | CLONE_SYSVSEM))
 		return -EINVAL;
 
-	new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
+#ifdef CONFIG_VE
+	/*
+	 * If a new ve namespace was just created for tsk by copy_ve_ns()
+	 * (which runs immediately before us in copy_process), use that ve
+	 * as the owner for new mount/net namespaces created in the same
+	 * clone. Otherwise get_exec_env() in the callees would resolve to
+	 * the parent's ve and we'd end up with owner_ve/ve_owner pointing
+	 * at the wrong ve.
+	 */
+	if (flags & CLONE_NEWVE)
+		new_ve = task_ve(tsk);
+#endif
+
+	new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs, new_ve);
 	if (IS_ERR(new_ns))
 		return  PTR_ERR(new_ns);
 
@@ -214,9 +228,11 @@ void free_nsproxy(struct nsproxy *ns)
  * On success, returns the new nsproxy.
  */
 int unshare_nsproxy_namespaces(unsigned long unshare_flags,
-	struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
+	struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs,
+	struct ve_namespace *new_ve_ns)
 {
 	struct user_namespace *user_ns;
+	struct ve_struct *new_ve = NULL;
 	int err = 0;
 
 	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
@@ -228,8 +244,20 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
+#ifdef CONFIG_VE
+	/*
+	 * unshare_ve_namespace() ran before us and may have allocated a
+	 * new ve_namespace which is not yet installed on current. Pass
+	 * its ve down so that new mount/net namespaces created here are
+	 * owned by it instead of current's (about to be replaced) ve.
+	 */
+	if (new_ve_ns)
+		new_ve = new_ve_ns->ve;
+#endif
+
 	*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
-					 new_fs ? new_fs : current->fs);
+					 new_fs ? new_fs : current->fs,
+					 new_ve);
 	if (IS_ERR(*new_nsp)) {
 		err = PTR_ERR(*new_nsp);
 		goto out;
@@ -267,7 +295,7 @@ int exec_task_namespaces(void)
 	if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
 		return 0;
 
-	new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
+	new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs, NULL);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
@@ -341,7 +369,8 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
 {
 	struct task_struct *me = current;
 
-	nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
+	nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs,
+					       NULL);
 	if (IS_ERR(nsset->nsproxy))
 		return PTR_ERR(nsset->nsproxy);
 
-- 
2.53.0



More information about the Devel mailing list