[Devel] [PATCH 1/1] User namespaces: fix refcounting
Serge E. Hallyn
serue at us.ibm.com
Wed Oct 8 07:42:46 PDT 2008
Hi David,
attached is a respin of the three user namespace cleanup patches I've
been trying to push upstream. I've ported them on top of James'
next-creds-subsys branch (and consolidated into a single patch bc the
splitup wasn't perfect anyway).
These are prep patches on top of which I can then start the real user
namespace work. If this looks ok, then I will try to get it into
linux-next after next-creds-subsys goes in there.
Perhaps the most objectionable part of this to you may be the
__task_commit_creds(). In the case of clone(CLONE_NEWUSER),
current is not the task which will receive the new credentials, so
I need to be able to set creds on another task. The other task is
however still being created, so it is safe in this case. I kept
__task_commit_creds out of header files assuming that noone else
should be able to use it, ever.
Does this look ok to you?
thanks,
-serge
>From b09c70161f58e1d74e84415f29bfaf124367b641 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serue at us.ibm.com>
Date: Tue, 7 Oct 2008 20:54:59 -0500
Subject: [PATCH 1/1] User namespaces: fix refcounting
When a task does clone(CLONE_NEWNS), the task's user is the 'creator' of the
new user_namespace, and the user_namespace is tacked onto a list of those
created by this user.
Move the user namespace from the nsproxy to the struct user_struct, because
1. otherwise a struct cred or struct user_struct is insufficient to
unambiguously determine uid equivalence
2. this nicely fits the hierarchical user namespace+struct user_struct
model.
When a user_namespace is created, the user which created it is
marked as its 'creator'. The user_namespace pins the creator.
Each userid in a user_ns pins the user_ns. This keeps refcounting
nice and simple.
At the same time, this patch simplifies the refcounting. The current
user and userns locking works as follows:
The task pins the user struct.
The task pins the nsproxy, the nsproxy pins the user_ns.
When a new user_ns is created, it creates a root user for
it, and pins it. When the nsproxy releases the user_ns,
the userns tries to release all its user structs.
So you see that the refcounting "works" for now, but only
because the nsproxy (and therefore usr_ns) and user structs
will be freed at the same time - when the last task using
them is released.
Now we need to put the user_ns in the struct user. You can
see that will mess up the refcounting.
Fortunately, once the user_ns is available from tsk->user,
we don't need it in nsproxy.
So here is how the refcounting *should* be done:
The task pins the user struct.
The user struct pins its user namespace.
The user namespace pins the struct user which created it.
A user namespace now doesn't need to release its userids,
because it is only released when its last user disappears.
Finally, this patch fixes a bug where not all credentials were
being reset for the first task in a new user namespace. In
particular, if you did
capset cap_sys_admin=ep ns_exec
su - hallyn
ns_exec -U /bin/sh
id
then you'd see hallyn's userid and all preexisting groups. With this patch,
cloning a new user namespace will set the task's uid and gid to 0, and reset
the group_info to the empty set assigned to init.
Changelog:
Oct 07: Port on top of David Howell's credentials work.
Aug 25: make free_user not inlined as it's not trivial. (Eric
Biederman suggestion)
Aug 1: renamed user->user_namespace to user_ns, as the next
patch did anyway.
Aug 1: move put_user_ns call in one free_user() definition
to move it outside the lock in free_user. put_user_ns
calls free_user on the user_ns->creator, which in
turn would grab the lock again.
Signed-off-by: Serge E. Hallyn <serue at us.ibm.com>
---
include/linux/init_task.h | 1 -
include/linux/nsproxy.h | 1 -
include/linux/sched.h | 1 +
include/linux/user_namespace.h | 11 +++----
kernel/cred.c | 8 ++++-
kernel/fork.c | 2 +-
kernel/nsproxy.c | 10 +-----
kernel/sys.c | 4 +-
kernel/user.c | 40 +++++++--------------------
kernel/user_namespace.c | 58 ++++++++++++++++++----------------------
10 files changed, 53 insertions(+), 83 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index edf0285..126c3c4 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -57,7 +57,6 @@ extern struct nsproxy init_nsproxy;
.mnt_ns = NULL, \
INIT_NET_NS(net_ns) \
INIT_IPC_NS(ipc_ns) \
- .user_ns = &init_user_ns, \
}
#define INIT_SIGHAND(sighand) { \
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index c8a768e..afad7de 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -27,7 +27,6 @@ struct nsproxy {
struct ipc_namespace *ipc_ns;
struct mnt_namespace *mnt_ns;
struct pid_namespace *pid_ns;
- struct user_namespace *user_ns;
struct net *net_ns;
};
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 86ba19c..86acc5f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -595,6 +595,7 @@ struct user_struct {
/* Hash table maintenance information */
struct hlist_node uidhash_node;
uid_t uid;
+ struct user_namespace *user_ns;
#ifdef CONFIG_USER_SCHED
struct task_group *tg;
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index b5f41d4..03aedce 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -13,6 +13,7 @@ struct user_namespace {
struct kref kref;
struct hlist_head uidhash_table[UIDHASH_SZ];
struct user_struct *root_user;
+ struct user_struct *creator;
};
extern struct user_namespace init_user_ns;
@@ -26,8 +27,7 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
return ns;
}
-extern struct user_namespace *copy_user_ns(int flags,
- struct user_namespace *old_ns);
+extern int create_new_userns(int flags, struct task_struct *tsk);
extern void free_user_ns(struct kref *kref);
static inline void put_user_ns(struct user_namespace *ns)
@@ -43,13 +43,12 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
return &init_user_ns;
}
-static inline struct user_namespace *copy_user_ns(int flags,
- struct user_namespace *old_ns)
+static inline int create_new_userns(int flags, struct task_struct *tsk);
{
if (flags & CLONE_NEWUSER)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
- return old_ns;
+ return 0;
}
static inline void put_user_ns(struct user_namespace *ns)
diff --git a/kernel/cred.c b/kernel/cred.c
index 13697ca..67c9e8c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -341,9 +341,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
* Always returns 0 thus allowing this function to be tail-called at the end
* of, say, sys_setgid().
*/
-int commit_creds(struct cred *new)
+int __task_commit_creds(struct task_struct *task, struct cred *new)
{
- struct task_struct *task = current;
const struct cred *old;
BUG_ON(task->cred != task->real_cred);
@@ -405,6 +404,11 @@ int commit_creds(struct cred *new)
put_cred(old);
return 0;
}
+
+int commit_creds(struct cred *new)
+{
+ return __task_commit_creds(current, new);
+}
EXPORT_SYMBOL(commit_creds);
/**
diff --git a/kernel/fork.c b/kernel/fork.c
index e65f424..a4eb85e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -933,7 +933,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (atomic_read(&p->real_cred->user->processes) >=
p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
- p->real_cred->user != current->nsproxy->user_ns->root_user)
+ p->real_cred->user != current_user()->user_ns->root_user)
goto bad_fork_free;
}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d3ef29..766c672 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -80,11 +80,9 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_pid;
}
- new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
- if (IS_ERR(new_nsp->user_ns)) {
- err = PTR_ERR(new_nsp->user_ns);
+ err = create_new_userns(flags, tsk);
+ if (err)
goto out_user;
- }
new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
@@ -95,8 +93,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
return new_nsp;
out_net:
- if (new_nsp->user_ns)
- put_user_ns(new_nsp->user_ns);
out_user:
if (new_nsp->pid_ns)
put_pid_ns(new_nsp->pid_ns);
@@ -173,8 +169,6 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns)
put_pid_ns(ns->pid_ns);
- if (ns->user_ns)
- put_user_ns(ns->user_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 68ce096..05ff7cc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -565,13 +565,13 @@ static int set_user(struct cred *new)
{
struct user_struct *new_user;
- new_user = alloc_uid(current->nsproxy->user_ns, new->uid);
+ new_user = alloc_uid(current_user()->user_ns, new->uid);
if (!new_user)
return -EAGAIN;
if (atomic_read(&new_user->processes) >=
current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
- new_user != current->nsproxy->user_ns->root_user) {
+ new_user != current_user()->user_ns->root_user) {
free_uid(new_user);
return -EAGAIN;
}
diff --git a/kernel/user.c b/kernel/user.c
index ed4dc57..94861ac 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -23,6 +23,7 @@ struct user_namespace init_user_ns = {
.refcount = ATOMIC_INIT(2),
},
.root_user = &root_user,
+ .creator = &root_user,
};
EXPORT_SYMBOL_GPL(init_user_ns);
@@ -54,6 +55,7 @@ struct user_struct root_user = {
.files = ATOMIC_INIT(0),
.sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
+ .user_ns = &init_user_ns,
#ifdef CONFIG_USER_SCHED
.tg = &init_task_group,
#endif
@@ -314,12 +316,13 @@ done:
* IRQ state (as stored in flags) is restored and uidhash_lock released
* upon function exit.
*/
-static inline void free_user(struct user_struct *up, unsigned long flags)
+static void free_user(struct user_struct *up, unsigned long flags)
{
/* restore back the count */
atomic_inc(&up->__count);
spin_unlock_irqrestore(&uidhash_lock, flags);
+ put_user_ns(up->user_ns);
INIT_WORK(&up->work, remove_user_sysfs_dir);
schedule_work(&up->work);
}
@@ -335,13 +338,14 @@ static inline void uids_mutex_unlock(void) { }
* IRQ state (as stored in flags) is restored and uidhash_lock released
* upon function exit.
*/
-static inline void free_user(struct user_struct *up, unsigned long flags)
+static void free_user(struct user_struct *up, unsigned long flags)
{
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
sched_destroy_user(up);
key_put(up->uid_keyring);
key_put(up->session_keyring);
+ put_user_ns(up->user_ns);
kmem_cache_free(uid_cachep, up);
}
@@ -357,7 +361,7 @@ struct user_struct *find_user(uid_t uid)
{
struct user_struct *ret;
unsigned long flags;
- struct user_namespace *ns = current->nsproxy->user_ns;
+ struct user_namespace *ns = current_user()->user_ns;
spin_lock_irqsave(&uidhash_lock, flags);
ret = uid_hash_find(uid, uidhashentry(ns, uid));
@@ -404,6 +408,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
if (sched_create_user(new) < 0)
goto out_free_user;
+ new->user_ns = get_user_ns(ns);
+
if (uids_user_create(new))
goto out_destoy_sched;
@@ -436,6 +442,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
out_destoy_sched:
sched_destroy_user(new);
+ put_user_ns(new->user_ns);
out_free_user:
kmem_cache_free(uid_cachep, new);
out_unlock:
@@ -443,33 +450,6 @@ out_unlock:
return NULL;
}
-#ifdef CONFIG_USER_NS
-void release_uids(struct user_namespace *ns)
-{
- int i;
- unsigned long flags;
- struct hlist_head *head;
- struct hlist_node *nd;
-
- spin_lock_irqsave(&uidhash_lock, flags);
- /*
- * collapse the chains so that the user_struct-s will
- * be still alive, but not in hashes. subsequent free_uid()
- * will free them.
- */
- for (i = 0; i < UIDHASH_SZ; i++) {
- head = ns->uidhash_table + i;
- while (!hlist_empty(head)) {
- nd = head->first;
- hlist_del_init(nd);
- }
- }
- spin_unlock_irqrestore(&uidhash_lock, flags);
-
- free_uid(ns->root_user);
-}
-#endif
-
static int __init uid_cache_init(void)
{
int n;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0d9c51d..8e00166 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,22 +9,30 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/cred.h>
+
+extern int __task_commit_creds(struct task_struct *tsk, struct cred *new);
/*
* Clone a new ns copying an original user ns, setting refcount to 1
* @old_ns: namespace to clone
* Return NULL on error (failure to kmalloc), new ns otherwise
+ * The caller (either clone or unshare) must set the target task's
+ * credentials accordingly. We can't do it here, bc in the case of
+ * clone, current isn't the new task.
*/
-static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
+int create_new_userns(int flags, struct task_struct *tsk)
{
struct user_namespace *ns;
- struct user_struct *new_user;
struct cred *new;
int n;
+ if (!(flags & CLONE_NEWUSER))
+ return 0;
+
ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
if (!ns)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
kref_init(&ns->kref);
@@ -35,44 +43,30 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
ns->root_user = alloc_uid(ns, 0);
if (!ns->root_user) {
kfree(ns);
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
- /* Reset current->user with a new one */
- new_user = alloc_uid(ns, current_uid());
- if (!new_user) {
- free_uid(ns->root_user);
- kfree(ns);
- return ERR_PTR(-ENOMEM);
- }
+ /* alloc_uid() incremented the userns refcount. drop it again */
+ put_user_ns(ns);
+
+ /* save away and pin the creating user */
+ ns->creator = current_user();
+ atomic_inc(&ns->creator->__count);
- /* Install the new user */
+ /* now create and commit the new credentials */
new = prepare_creds();
if (!new) {
- free_uid(new_user);
free_uid(ns->root_user);
kfree(ns);
+ return -ENOMEM;
}
free_uid(new->user);
- new->user = new_user;
- commit_creds(new);
- return ns;
-}
-
-struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
-{
- struct user_namespace *new_ns;
-
- BUG_ON(!old_ns);
- get_user_ns(old_ns);
-
- if (!(flags & CLONE_NEWUSER))
- return old_ns;
-
- new_ns = clone_user_ns(old_ns);
+ new->user = ns->root_user;
+ new->uid = new->euid = new->suid = new->fsuid = 0;
+ new->gid = new->egid = new->sgid = new->fsgid = 0;
+ __task_commit_creds(tsk, new);
- put_user_ns(old_ns);
- return new_ns;
+ return 0;
}
void free_user_ns(struct kref *kref)
@@ -80,7 +74,7 @@ void free_user_ns(struct kref *kref)
struct user_namespace *ns;
ns = container_of(kref, struct user_namespace, kref);
- release_uids(ns);
+ free_uid(ns->creator);
kfree(ns);
}
EXPORT_SYMBOL(free_user_ns);
--
1.5.4.3
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list