[Devel] [PATCH VZ10 5/8] ve/nsproxy: forward new ve from copy_namespaces and ksys_unshare
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Wed Apr 29 16:41:39 MSK 2026
This is the actual fix for the problem where combining CLONE_NEWVE with
CLONE_NEWNET and/or CLONE_NEWNS in a single clone() or unshare() syscall
left the new netns/mntns owned by the parent's ve instead of the freshly
created one.
The cause was that copy_net_ns() and copy_mnt_ns() resolved the owning
ve via get_exec_env() at call time, while:
- in copy_process(), copy_ve_ns() updates only @tsk (the child) but
current is still the parent;
- in ksys_unshare(), unshare_ve_namespace() builds a new ve_namespace
in a local variable and only commits it to current after
unshare_nsproxy_namespaces() has already run.
So in both call paths there was a window in which get_exec_env() returned
a ve other than the one that should own the namespaces being created in
the same syscall.
Now that copy_mnt_ns() and copy_net_ns() accept an explicit owning ve,
forward it from:
- copy_namespaces(): when CLONE_NEWVE is set, use task_ve(tsk), which
copy_ve_ns() has just updated to point at the new ve;
- unshare_nsproxy_namespaces(): a new struct ve_namespace * argument
fed by ksys_unshare() with the new_ve_ns produced by the preceding
unshare_ve_namespace() call.
Other callers of create_new_namespaces() (exec_task_namespaces() and
prepare_nsset() for setns) pass NULL, so copy_mnt_ns() and copy_net_ns()
still receive NULL as before: these paths never create a new ve.
After this patch, a clone3()/unshare() call with CLONE_NEWVE | CLONE_NEWNET
| CLONE_NEWNS attributes the new netns/mntns to the new ve, including the
ve.netns_avail_nr / ve.mnt_nr accounting.
Note: The companion patch removes the now-unnecessary unshare-time guard
in check_unshare_flags() that used to forbid this combination outright.
https://virtuozzo.atlassian.net/browse/VSTOR-129744
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Feature: ve: ve generic structures
---
include/linux/nsproxy.h | 3 ++-
kernel/fork.c | 2 +-
kernel/nsproxy.c | 45 +++++++++++++++++++++++++++++++++--------
3 files changed, 40 insertions(+), 10 deletions(-)
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 4dc0fe036bb9..438eb291a7f9 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -110,8 +110,9 @@ void exit_task_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
int exec_task_namespaces(void);
void free_nsproxy(struct nsproxy *ns);
+struct ve_namespace;
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
- struct cred *, struct fs_struct *);
+ struct cred *, struct fs_struct *, struct ve_namespace *);
int __init nsproxy_cache_init(void);
static inline void put_nsproxy(struct nsproxy *ns)
diff --git a/kernel/fork.c b/kernel/fork.c
index 95d843b9dd1e..a1b9fec27579 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3360,7 +3360,7 @@ int ksys_unshare(unsigned long unshare_flags)
if (err)
goto bad_unshare_cleanup_cred;
err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
- new_cred, new_fs);
+ new_cred, new_fs, new_ve_ns);
if (err)
goto bad_unshare_cleanup_ve_namespace;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d0e8f9d98a3..069a103993d7 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -67,7 +67,7 @@ static inline struct nsproxy *create_nsproxy(void)
*/
static struct nsproxy *create_new_namespaces(unsigned long flags,
struct task_struct *tsk, struct user_namespace *user_ns,
- struct fs_struct *new_fs)
+ struct fs_struct *new_fs, struct ve_struct *new_ve)
{
struct nsproxy *new_nsp;
int err;
@@ -77,7 +77,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
return ERR_PTR(-ENOMEM);
new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns,
- new_fs, NULL);
+ new_fs, new_ve);
if (IS_ERR(new_nsp->mnt_ns)) {
err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;
@@ -110,7 +110,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
}
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns,
- NULL);
+ new_ve);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
goto out_net;
@@ -156,6 +156,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
+ struct ve_struct *new_ve = NULL;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
@@ -179,7 +180,20 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
(CLONE_NEWIPC | CLONE_SYSVSEM))
return -EINVAL;
- new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
+#ifdef CONFIG_VE
+ /*
+ * If a new ve namespace was just created for tsk by copy_ve_ns()
+ * (which runs immediately before us in copy_process), use that ve
+ * as the owner for new mount/net namespaces created in the same
+ * clone. Otherwise get_exec_env() in the callees would resolve to
+ * the parent's ve and we'd end up with owner_ve/ve_owner pointing
+ * at the wrong ve.
+ */
+ if (flags & CLONE_NEWVE)
+ new_ve = task_ve(tsk);
+#endif
+
+ new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs, new_ve);
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
@@ -214,9 +228,11 @@ void free_nsproxy(struct nsproxy *ns)
* On success, returns the new nsproxy.
*/
int unshare_nsproxy_namespaces(unsigned long unshare_flags,
- struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
+ struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs,
+ struct ve_namespace *new_ve_ns)
{
struct user_namespace *user_ns;
+ struct ve_struct *new_ve = NULL;
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
@@ -228,8 +244,20 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
+#ifdef CONFIG_VE
+ /*
+ * unshare_ve_namespace() ran before us and may have allocated a
+ * new ve_namespace which is not yet installed on current. Pass
+ * its ve down so that new mount/net namespaces created here are
+ * owned by it instead of current's (about to be replaced) ve.
+ */
+ if (new_ve_ns)
+ new_ve = new_ve_ns->ve;
+#endif
+
*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
- new_fs ? new_fs : current->fs);
+ new_fs ? new_fs : current->fs,
+ new_ve);
if (IS_ERR(*new_nsp)) {
err = PTR_ERR(*new_nsp);
goto out;
@@ -267,7 +295,7 @@ int exec_task_namespaces(void)
if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
return 0;
- new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
+ new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs, NULL);
if (IS_ERR(new))
return PTR_ERR(new);
@@ -341,7 +369,8 @@ static int prepare_nsset(unsigned flags, struct nsset *nsset)
{
struct task_struct *me = current;
- nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
+ nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs,
+ NULL);
if (IS_ERR(nsset->nsproxy))
return PTR_ERR(nsset->nsproxy);
--
2.53.0
More information about the Devel
mailing list