[Devel] Re: [RFC][PATCH 3/5] Use pid namespace from struct pid_nrs list
Eric W. Biederman
ebiederm at xmission.com
Sun Mar 11 05:48:04 PDT 2007
sukadev at us.ibm.com writes:
> From: Sukadev Bhattiprolu <sukadev at us.ibm.com>
> Subject: [RFC][PATCH 3/5] Use pid namespace from struct pid_nrs list
>
> Stop using task->nsproxy->pid_ns. Use pid_namespace from pid->pid_nrs
> list instead.
>
> To simplify error handling, this patch moves processing of CLONE_NEWPID
> flag, currently in copy_namespaces()/copy_process(), to alloc_pid() which
> is where the process association with a pid namespace is established.
>
> i.e when cloning a new pid namespace, alloc_pid() allocates a new pid_nr
> for both the parent and child namespaces.
This patch seems to do a bit much, it is hard to follow what changes you
are making.
It probably makes sense to modify things so alloc_pid can do everything
it needs to.
It looks like we can safely move alloc_pid into copy_process and
just dig out the pid number and place it in nr if copy_process succeeds.
Which should allow the special case for setting the child reaper to go
away, because we can allocate the task_struct before allocating the struct
pid.
> Changelog:
> - Move definition of set_pid_ns_child_reaper() into the previous
> helper function patch.
> - Minor changes to accomodate changes in underlying patches.
> - Fix compile warnings about parent_pid_ns (Cedric Le Goater)
> - [Badari Pulavarty's comments]: No need to allocate nsproxy when
> only CLONE_NEWPID is set. And attach_pid_nr() only needs one
> parameter.
> - Add privilege check for clone(CLONE_NEWPID).
>
> Signed-off-by: Sukadev Bhattiprolu <sukadev at us.ibm.com>
> Cc: Cedric Le Goater <clg at fr.ibm.com>
> Cc: Dave Hansen <haveblue at us.ibm.com>
> Cc: Serge Hallyn <serue at us.ibm.com>
> Cc: containers at lists.osdl.org
>
> ---
> include/linux/pid.h | 2 -
> include/linux/pid_namespace.h | 11 ++-------
> kernel/fork.c | 12 ++++++---
> kernel/nsproxy.c | 11 ---------
> kernel/pid.c | 51 ++++++++++++++++++++++++++++++------------
> 5 files changed, 50 insertions(+), 37 deletions(-)
>
> Index: lx26-20-mm2b/include/linux/pid.h
> ===================================================================
> --- lx26-20-mm2b.orig/include/linux/pid.h 2007-03-09 19:00:11.000000000 -0800
> +++ lx26-20-mm2b/include/linux/pid.h 2007-03-09 19:01:09.000000000 -0800
> @@ -119,7 +119,7 @@ extern struct pid *find_ge_pid(int nr);
> extern int attach_pid_nr(struct pid *pid, struct pid_nr *pid_nr);
> extern void free_pid_nr(struct pid_nr *pid_nr);
> extern struct pid_nr *alloc_pid_nr(struct pid_namespace *pid_ns);
> -extern struct pid *alloc_pid(void);
> +extern struct pid *alloc_pid(int clone_flags);
> extern void FASTCALL(free_pid(struct pid *pid));
> extern pid_t pid_nr(struct pid *pid);
>
> Index: lx26-20-mm2b/kernel/fork.c
> ===================================================================
> --- lx26-20-mm2b.orig/kernel/fork.c 2007-03-09 19:00:42.000000000 -0800
> +++ lx26-20-mm2b/kernel/fork.c 2007-03-09 19:01:09.000000000 -0800
> @@ -1144,6 +1144,8 @@ static struct task_struct *copy_process(
> if (clone_flags & CLONE_THREAD)
> p->tgid = current->tgid;
>
> + if ((retval = priv_check_pid_ns(clone_flags)))
> + goto bad_fork_cleanup_policy;
> if ((retval = security_task_alloc(p)))
> goto bad_fork_cleanup_policy;
> if ((retval = audit_alloc(p)))
> @@ -1169,6 +1171,9 @@ static struct task_struct *copy_process(
> if (retval)
> goto bad_fork_cleanup_namespaces;
>
> + /* We are now ready to set child reaper if we cloned pid ns */
> + set_pid_ns_child_reaper(clone_flags, pid, p);
> +
> p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr :
> NULL;
> /*
> * Clear TID on mm_release()?
> @@ -1384,7 +1389,7 @@ long do_fork(unsigned long clone_flags,
> int __user *child_tidptr)
> {
> struct task_struct *p;
> - struct pid *pid = alloc_pid();
> + struct pid *pid = alloc_pid(clone_flags);
> long nr;
>
> if (!pid)
> @@ -1671,7 +1676,7 @@ asmlinkage long sys_unshare(unsigned lon
> if ((err = unshare_pid_ns(unshare_flags, &new_pid_nr)))
> goto bad_unshare_cleanup_ipc;
>
> - if (new_ns || new_uts || new_ipc || new_pid_nr) {
> + if (new_ns || new_uts || new_ipc) {
> old_nsproxy = current->nsproxy;
> new_nsproxy = dup_nsproxy(old_nsproxy);
> if (!new_nsproxy) {
> @@ -1730,8 +1735,7 @@ asmlinkage long sys_unshare(unsigned lon
> }
>
> if (new_pid_nr) {
> - pid = task_pid_ns(current);
> - set_task_pid_ns(current, new_pid_nr->pid_ns);
> + attach_pid_nr(task_pid(current), new_pid_nr);
> new_pid_nr = NULL;
> }
>
> Index: lx26-20-mm2b/kernel/nsproxy.c
> ===================================================================
> --- lx26-20-mm2b.orig/kernel/nsproxy.c 2007-03-09 19:00:14.000000000 -0800
> +++ lx26-20-mm2b/kernel/nsproxy.c 2007-03-09 19:01:09.000000000 -0800
> @@ -67,8 +67,6 @@ struct nsproxy *dup_nsproxy(struct nspro
> get_uts_ns(ns->uts_ns);
> if (ns->ipc_ns)
> get_ipc_ns(ns->ipc_ns);
> - if (ns->pid_ns)
> - get_pid_ns(ns->pid_ns);
> }
>
> return ns;
> @@ -90,7 +88,7 @@ int copy_nsproxy(int flags, struct task_
>
> get_nsproxy(old_ns);
>
> - ns_all = CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID;
> + ns_all = CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC;
>
> if (!(flags & ns_all))
> return 0;
> @@ -115,17 +113,10 @@ int copy_nsproxy(int flags, struct task_
> if (err)
> goto out_ipc;
>
> - err = copy_pid_ns(flags, tsk);
> - if (err)
> - goto out_pid;
> -
> out:
> put_nsproxy(old_ns);
> return err;
>
> -out_pid:
> - if (new_ns->ipc_ns)
> - put_ipc_ns(new_ns->ipc_ns);
> out_ipc:
> if (new_ns->uts_ns)
> put_uts_ns(new_ns->uts_ns);
> Index: lx26-20-mm2b/kernel/pid.c
> ===================================================================
> --- lx26-20-mm2b.orig/kernel/pid.c 2007-03-09 19:00:42.000000000 -0800
> +++ lx26-20-mm2b/kernel/pid.c 2007-03-09 19:01:09.000000000 -0800
> @@ -221,8 +221,13 @@ fastcall void free_pid(struct pid *pid)
> hlist_del_rcu(&pid->pid_chain);
> spin_unlock_irqrestore(&pidmap_lock, flags);
>
> - hlist_for_each_entry(pid_nr, pos, &pid->pid_nrs, node)
> + hlist_for_each_entry(pid_nr, pos, &pid->pid_nrs, node) {
> free_pidmap(pid_nr->pid_ns, pid_nr->nr);
> +
> + /* put the reference we got in kref_init() in clone_pid_ns() */
> + if (pid_nr->nr == 1)
> + put_pid_ns(pid_nr->pid_ns);
Ok. This seems to make sense, but why restrict this to only pid 1?
I'm almost certain this will be the case, but... this seems a like
a unwarranted special case at the moment.
Basically why is it safe to restrict this to pid == 1. Is it possible
that we can race here?
> + }
> call_rcu(&pid->rcu, delayed_put_pid);
> }
>
> @@ -357,34 +362,48 @@ struct pid_namespace *pid_ns(struct pid
> return ns;
> }
>
> -struct pid *alloc_pid(void)
> +struct pid *alloc_pid(int flags)
> {
> struct pid *pid;
> enum pid_type type;
> - int nr = -1;
> - struct pid_nr *pid_nr;
> + struct pid_nr *pid_nr[2] = { NULL, NULL};
I would rather not see pid_nr special cased this way at all (a loop?)
but if we are going to I think two separate variables makes more
sense than this array.
> + struct pid_namespace *new_pid_ns = NULL;
>
> pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
> if (!pid)
> return NULL;
>
> - nr = alloc_pidmap(task_pid_ns(current));
> - if (nr < 0)
> + pid_nr[0] = alloc_pidmap_pid_nr(task_pid_ns(current));
> + if (!pid_nr[0] < 0)
> goto out_free_pid;
>
> - pid_nr = alloc_pid_nr(task_pid_ns(current));
> - if (!pid_nr)
> - goto out_free_pidmap;
> -
> + if (flags & CLONE_NEWPID) {
> + new_pid_ns = clone_pid_ns();
> + if (!new_pid_ns)
> + goto out_free_pid_nr0;
> + /*
> + * For now, allocate a pid_nr only for the new pid namespace.
> + * Eventually we should allocate a pid_nr for each ancestor
> + * namespace. While this could cost us additional memory in
> + * deeply nested containers, it would allow us to see/signal
> + * all processes from init-pid-ns.
> + */
> + pid_nr[1] = alloc_pidmap_pid_nr(new_pid_ns);
> + if (!pid_nr[1])
> + goto out_free_pid_ns;
> + }
> atomic_set(&pid->count, 1);
> - pid->nr = pid_nr->nr = nr; /* pid->nr to be removed soon */
> + pid->nr = pid_nr[0]->nr; /* pid->nr to be removed soon */
> for (type = 0; type < PIDTYPE_MAX; ++type)
> INIT_HLIST_HEAD(&pid->tasks[type]);
>
> spin_lock_init(&pid->lock);
>
> INIT_HLIST_HEAD(&pid->pid_nrs);
> - hlist_add_head_rcu(&pid_nr->node, &pid->pid_nrs);
> + hlist_add_head_rcu(&pid_nr[0]->node, &pid->pid_nrs);
> +
> + if (pid_nr[1])
> + hlist_add_head_rcu(&pid_nr[1]->node, &pid->pid_nrs);
>
> spin_lock_irq(&pidmap_lock);
> hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
> @@ -392,11 +411,15 @@ struct pid *alloc_pid(void)
>
> return pid;
>
> -out_free_pidmap:
> - free_pidmap(task_pid_ns(current), nr);
> +out_free_pid_ns:
> + put_pid_ns(new_pid_ns);
> +
> +out_free_pid_nr0:
> + free_pidmap_pid_nr(pid_nr[0]);
>
> out_free_pid:
> kmem_cache_free(pid_cachep, pid);
> +
> return NULL;
> }
>
> Index: lx26-20-mm2b/include/linux/pid_namespace.h
> ===================================================================
> --- lx26-20-mm2b.orig/include/linux/pid_namespace.h 2007-03-09
> 19:00:11.000000000 -0800
> +++ lx26-20-mm2b/include/linux/pid_namespace.h 2007-03-09 19:01:09.000000000
> -0800
> @@ -33,6 +33,7 @@ extern int unshare_pid_ns(unsigned long
> struct pid_nr **new_pid_nr);
> extern int copy_pid_ns(int flags, struct task_struct *tsk);
> extern void free_pid_ns(struct kref *kref);
> +extern struct pid_namespace *pid_ns(struct pid * pid);
>
> static inline void put_pid_ns(struct pid_namespace *ns)
> {
> @@ -41,18 +42,12 @@ static inline void put_pid_ns(struct pid
>
> static inline struct pid_namespace *task_pid_ns(struct task_struct *tsk)
> {
> - return tsk->nsproxy->pid_ns;
> -}
> -
> -static inline void set_task_pid_ns(struct task_struct *tsk,
> - struct pid_namespace * ns)
> -{
> - tsk->nsproxy->pid_ns = ns
> + return pid_ns(task_pid(tsk));
> }
>
> static inline struct task_struct *child_reaper(struct task_struct *tsk)
> {
> - return init_pid_ns.child_reaper;
> + return task_pid_ns(tsk)->child_reaper;
> }
>
> #endif /* _LINUX_PID_NS_H */
Eric
_______________________________________________
Containers mailing list
Containers at lists.osdl.org
https://lists.osdl.org/mailman/listinfo/containers
More information about the Devel
mailing list