[Devel] [PATCH 1/1] User namespaces: fix refcounting

Serge E. Hallyn serue at us.ibm.com
Wed Oct 8 07:42:46 PDT 2008


Hi David,

attached is a respin of the three user namespace cleanup patches I've
been trying to push upstream.  I've ported them on top of James'
next-creds-subsys branch (and consolidated into a single patch bc the
splitup wasn't perfect anyway).

These are prep patches on top of which I can then start the real user
namespace work.  If this looks ok, then I will try to get it into
linux-next after next-creds-subsys goes in there.

Perhaps the most objectionable part of this to you may be the
__task_commit_creds().  In the case of clone(CLONE_NEWUSER),
current is not the task which will receive the new credentials, so
I need to be able to set creds on another task.  The other task is
however still being created, so it is safe in this case.  I kept
__task_commit_creds out of header files assuming that noone else
should be able to use it, ever.

Does this look ok to you?

thanks,
-serge

>From b09c70161f58e1d74e84415f29bfaf124367b641 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serue at us.ibm.com>
Date: Tue, 7 Oct 2008 20:54:59 -0500
Subject: [PATCH 1/1] User namespaces: fix refcounting

When a task does clone(CLONE_NEWNS), the task's user is the 'creator' of the
new user_namespace, and the user_namespace is tacked onto a list of those
created by this user.

Move the user namespace from the nsproxy to the struct user_struct, because
	1. otherwise a struct cred or struct user_struct is insufficient to
	   unambiguously determine uid equivalence
	2. this nicely fits the hierarchical user namespace+struct user_struct
	   model.

When a user_namespace is created, the user which created it is
marked as its 'creator'.  The user_namespace pins the creator.
Each userid in a user_ns pins the user_ns.  This keeps refcounting
nice and simple.

At the same time, this patch simplifies the refcounting.  The current
user and userns locking works as follows:

        The task pins the user struct.

        The task pins the nsproxy, the nsproxy pins the user_ns.

        When a new user_ns is created, it creates a root user for
        it, and pins it.  When the nsproxy releases the user_ns,
        the userns tries to release all its user structs.

        So you see that the refcounting "works" for now, but only
        because the nsproxy (and therefore usr_ns) and user structs
        will be freed at the same time - when the last task using
        them is released.

        Now we need to put the user_ns in the struct user.  You can
        see that will mess up the refcounting.

        Fortunately, once the user_ns is available from tsk->user,
        we don't need it in nsproxy.

        So here is how the refcounting *should* be done:

        The task pins the user struct.

        The user struct pins its user namespace.

        The user namespace pins the struct user which created it.

        A user namespace now doesn't need to release its userids,
        because it is only released when its last user disappears.

Finally, this patch fixes a bug where not all credentials were
being reset for the first task in a new user namespace.  In
particular, if you did

	capset cap_sys_admin=ep ns_exec
        su - hallyn
          ns_exec -U /bin/sh
          id

then you'd see hallyn's userid and all preexisting groups.  With this patch,
cloning a new user namespace will set the task's uid and gid to 0, and reset
the group_info to the empty set assigned to init.

Changelog:
	Oct 07: Port on top of David Howell's credentials work.
        Aug 25: make free_user not inlined as it's not trivial.  (Eric
                Biederman suggestion)
        Aug 1: renamed user->user_namespace to user_ns, as the next
                patch did anyway.
        Aug 1: move put_user_ns call in one free_user() definition
                to move it outside the lock in free_user.  put_user_ns
                calls free_user on the user_ns->creator, which in
                turn would grab the lock again.

Signed-off-by: Serge E. Hallyn <serue at us.ibm.com>
---
 include/linux/init_task.h      |    1 -
 include/linux/nsproxy.h        |    1 -
 include/linux/sched.h          |    1 +
 include/linux/user_namespace.h |   11 +++----
 kernel/cred.c                  |    8 ++++-
 kernel/fork.c                  |    2 +-
 kernel/nsproxy.c               |   10 +-----
 kernel/sys.c                   |    4 +-
 kernel/user.c                  |   40 +++++++--------------------
 kernel/user_namespace.c        |   58 ++++++++++++++++++----------------------
 10 files changed, 53 insertions(+), 83 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index edf0285..126c3c4 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -57,7 +57,6 @@ extern struct nsproxy init_nsproxy;
 	.mnt_ns		= NULL,						\
 	INIT_NET_NS(net_ns)                                             \
 	INIT_IPC_NS(ipc_ns)						\
-	.user_ns	= &init_user_ns,				\
 }
 
 #define INIT_SIGHAND(sighand) {						\
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index c8a768e..afad7de 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -27,7 +27,6 @@ struct nsproxy {
 	struct ipc_namespace *ipc_ns;
 	struct mnt_namespace *mnt_ns;
 	struct pid_namespace *pid_ns;
-	struct user_namespace *user_ns;
 	struct net 	     *net_ns;
 };
 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 86ba19c..86acc5f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -595,6 +595,7 @@ struct user_struct {
 	/* Hash table maintenance information */
 	struct hlist_node uidhash_node;
 	uid_t uid;
+	struct user_namespace *user_ns;
 
 #ifdef CONFIG_USER_SCHED
 	struct task_group *tg;
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index b5f41d4..03aedce 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -13,6 +13,7 @@ struct user_namespace {
 	struct kref		kref;
 	struct hlist_head	uidhash_table[UIDHASH_SZ];
 	struct user_struct	*root_user;
+	struct user_struct	*creator;
 };
 
 extern struct user_namespace init_user_ns;
@@ -26,8 +27,7 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 	return ns;
 }
 
-extern struct user_namespace *copy_user_ns(int flags,
-					   struct user_namespace *old_ns);
+extern int create_new_userns(int flags, struct task_struct *tsk);
 extern void free_user_ns(struct kref *kref);
 
 static inline void put_user_ns(struct user_namespace *ns)
@@ -43,13 +43,12 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 	return &init_user_ns;
 }
 
-static inline struct user_namespace *copy_user_ns(int flags,
-						  struct user_namespace *old_ns)
+static inline int create_new_userns(int flags, struct task_struct *tsk);
 {
 	if (flags & CLONE_NEWUSER)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
-	return old_ns;
+	return 0;
 }
 
 static inline void put_user_ns(struct user_namespace *ns)
diff --git a/kernel/cred.c b/kernel/cred.c
index 13697ca..67c9e8c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -341,9 +341,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
  * Always returns 0 thus allowing this function to be tail-called at the end
  * of, say, sys_setgid().
  */
-int commit_creds(struct cred *new)
+int __task_commit_creds(struct task_struct *task, struct cred *new)
 {
-	struct task_struct *task = current;
 	const struct cred *old;
 
 	BUG_ON(task->cred != task->real_cred);
@@ -405,6 +404,11 @@ int commit_creds(struct cred *new)
 	put_cred(old);
 	return 0;
 }
+
+int commit_creds(struct cred *new)
+{
+	return __task_commit_creds(current, new);
+}
 EXPORT_SYMBOL(commit_creds);
 
 /**
diff --git a/kernel/fork.c b/kernel/fork.c
index e65f424..a4eb85e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -933,7 +933,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (atomic_read(&p->real_cred->user->processes) >=
 			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
 		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
-		    p->real_cred->user != current->nsproxy->user_ns->root_user)
+		    p->real_cred->user != current_user()->user_ns->root_user)
 			goto bad_fork_free;
 	}
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d3ef29..766c672 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -80,11 +80,9 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		goto out_pid;
 	}
 
-	new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
-	if (IS_ERR(new_nsp->user_ns)) {
-		err = PTR_ERR(new_nsp->user_ns);
+	err = create_new_userns(flags, tsk);
+	if (err)
 		goto out_user;
-	}
 
 	new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
 	if (IS_ERR(new_nsp->net_ns)) {
@@ -95,8 +93,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 	return new_nsp;
 
 out_net:
-	if (new_nsp->user_ns)
-		put_user_ns(new_nsp->user_ns);
 out_user:
 	if (new_nsp->pid_ns)
 		put_pid_ns(new_nsp->pid_ns);
@@ -173,8 +169,6 @@ void free_nsproxy(struct nsproxy *ns)
 		put_ipc_ns(ns->ipc_ns);
 	if (ns->pid_ns)
 		put_pid_ns(ns->pid_ns);
-	if (ns->user_ns)
-		put_user_ns(ns->user_ns);
 	put_net(ns->net_ns);
 	kmem_cache_free(nsproxy_cachep, ns);
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index 68ce096..05ff7cc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -565,13 +565,13 @@ static int set_user(struct cred *new)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(current->nsproxy->user_ns, new->uid);
+	new_user = alloc_uid(current_user()->user_ns, new->uid);
 	if (!new_user)
 		return -EAGAIN;
 
 	if (atomic_read(&new_user->processes) >=
 				current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
-			new_user != current->nsproxy->user_ns->root_user) {
+			new_user != current_user()->user_ns->root_user) {
 		free_uid(new_user);
 		return -EAGAIN;
 	}
diff --git a/kernel/user.c b/kernel/user.c
index ed4dc57..94861ac 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -23,6 +23,7 @@ struct user_namespace init_user_ns = {
 		.refcount	= ATOMIC_INIT(2),
 	},
 	.root_user = &root_user,
+	.creator = &root_user,
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
@@ -54,6 +55,7 @@ struct user_struct root_user = {
 	.files		= ATOMIC_INIT(0),
 	.sigpending	= ATOMIC_INIT(0),
 	.locked_shm     = 0,
+	.user_ns	= &init_user_ns,
 #ifdef CONFIG_USER_SCHED
 	.tg		= &init_task_group,
 #endif
@@ -314,12 +316,13 @@ done:
  * IRQ state (as stored in flags) is restored and uidhash_lock released
  * upon function exit.
  */
-static inline void free_user(struct user_struct *up, unsigned long flags)
+static void free_user(struct user_struct *up, unsigned long flags)
 {
 	/* restore back the count */
 	atomic_inc(&up->__count);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 
+	put_user_ns(up->user_ns);
 	INIT_WORK(&up->work, remove_user_sysfs_dir);
 	schedule_work(&up->work);
 }
@@ -335,13 +338,14 @@ static inline void uids_mutex_unlock(void) { }
  * IRQ state (as stored in flags) is restored and uidhash_lock released
  * upon function exit.
  */
-static inline void free_user(struct user_struct *up, unsigned long flags)
+static void free_user(struct user_struct *up, unsigned long flags)
 {
 	uid_hash_remove(up);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 	sched_destroy_user(up);
 	key_put(up->uid_keyring);
 	key_put(up->session_keyring);
+	put_user_ns(up->user_ns);
 	kmem_cache_free(uid_cachep, up);
 }
 
@@ -357,7 +361,7 @@ struct user_struct *find_user(uid_t uid)
 {
 	struct user_struct *ret;
 	unsigned long flags;
-	struct user_namespace *ns = current->nsproxy->user_ns;
+	struct user_namespace *ns = current_user()->user_ns;
 
 	spin_lock_irqsave(&uidhash_lock, flags);
 	ret = uid_hash_find(uid, uidhashentry(ns, uid));
@@ -404,6 +408,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 		if (sched_create_user(new) < 0)
 			goto out_free_user;
 
+		new->user_ns = get_user_ns(ns);
+
 		if (uids_user_create(new))
 			goto out_destoy_sched;
 
@@ -436,6 +442,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 
 out_destoy_sched:
 	sched_destroy_user(new);
+	put_user_ns(new->user_ns);
 out_free_user:
 	kmem_cache_free(uid_cachep, new);
 out_unlock:
@@ -443,33 +450,6 @@ out_unlock:
 	return NULL;
 }
 
-#ifdef CONFIG_USER_NS
-void release_uids(struct user_namespace *ns)
-{
-	int i;
-	unsigned long flags;
-	struct hlist_head *head;
-	struct hlist_node *nd;
-
-	spin_lock_irqsave(&uidhash_lock, flags);
-	/*
-	 * collapse the chains so that the user_struct-s will
-	 * be still alive, but not in hashes. subsequent free_uid()
-	 * will free them.
-	 */
-	for (i = 0; i < UIDHASH_SZ; i++) {
-		head = ns->uidhash_table + i;
-		while (!hlist_empty(head)) {
-			nd = head->first;
-			hlist_del_init(nd);
-		}
-	}
-	spin_unlock_irqrestore(&uidhash_lock, flags);
-
-	free_uid(ns->root_user);
-}
-#endif
-
 static int __init uid_cache_init(void)
 {
 	int n;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0d9c51d..8e00166 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,22 +9,30 @@
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/cred.h>
+
+extern int __task_commit_creds(struct task_struct *tsk, struct cred *new);
 
 /*
  * Clone a new ns copying an original user ns, setting refcount to 1
  * @old_ns: namespace to clone
  * Return NULL on error (failure to kmalloc), new ns otherwise
+ * The caller (either clone or unshare) must set the target task's
+ * credentials accordingly.  We can't do it here, bc in the case of
+ * clone, current isn't the new task.
  */
-static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
+int create_new_userns(int flags, struct task_struct *tsk)
 {
 	struct user_namespace *ns;
-	struct user_struct *new_user;
 	struct cred *new;
 	int n;
 
+	if (!(flags & CLONE_NEWUSER))
+		return 0;
+
 	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
 	if (!ns)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	kref_init(&ns->kref);
 
@@ -35,44 +43,30 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
 	ns->root_user = alloc_uid(ns, 0);
 	if (!ns->root_user) {
 		kfree(ns);
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	}
 
-	/* Reset current->user with a new one */
-	new_user = alloc_uid(ns, current_uid());
-	if (!new_user) {
-		free_uid(ns->root_user);
-		kfree(ns);
-		return ERR_PTR(-ENOMEM);
-	}
+	/* alloc_uid() incremented the userns refcount.  drop it again */
+	put_user_ns(ns);
+
+	/* save away and pin the creating user */
+	ns->creator = current_user();
+	atomic_inc(&ns->creator->__count);
 
-	/* Install the new user */
+	/* now create and commit the new credentials */
 	new = prepare_creds();
 	if (!new) {
-		free_uid(new_user);
 		free_uid(ns->root_user);
 		kfree(ns);
+		return -ENOMEM;
 	}
 	free_uid(new->user);
-	new->user = new_user;
-	commit_creds(new);
-	return ns;
-}
-
-struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
-{
-	struct user_namespace *new_ns;
-
-	BUG_ON(!old_ns);
-	get_user_ns(old_ns);
-
-	if (!(flags & CLONE_NEWUSER))
-		return old_ns;
-
-	new_ns = clone_user_ns(old_ns);
+	new->user = ns->root_user;
+	new->uid = new->euid = new->suid = new->fsuid = 0;
+	new->gid = new->egid = new->sgid = new->fsgid = 0;
+	__task_commit_creds(tsk, new);
 
-	put_user_ns(old_ns);
-	return new_ns;
+	return 0;
 }
 
 void free_user_ns(struct kref *kref)
@@ -80,7 +74,7 @@ void free_user_ns(struct kref *kref)
 	struct user_namespace *ns;
 
 	ns = container_of(kref, struct user_namespace, kref);
-	release_uids(ns);
+	free_uid(ns->creator);
 	kfree(ns);
 }
 EXPORT_SYMBOL(free_user_ns);
-- 
1.5.4.3

_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers




More information about the Devel mailing list