[Devel] Re: [PATCH linux-cr] nested pid namespaces (v2)

Louis Rilling Louis.Rilling at kerlabs.com
Mon Mar 22 03:55:03 PDT 2010


On Fri, Mar 19, 2010 at 04:39:55PM -0500, Serge E. Hallyn wrote:
> Support checkpoint and restart of tasks in nested pid namespaces.  At
> Oren's request here is an alternative to my previous implementation.  In
> this one, we keep the original single pids_array to minimize memory
> allocations.  The pids array entries are augmented with a pidns depth
> (relative to the container init's pidns, and an "rpid" which is the pid
> in the checkpointer's pidns (or 0 if no valid pid exists).  The rpid
> will be used by userspace to gather more information (like
> /proc/$$/mountinfo) after the kernel sys_checkpoint.  If any tasks are
> in nested pid namespace, another single array holds all of the vpids.
> At restart those are used by userspace to determine how to call
> eclone().  Kernel ignores them.
> 
> All cr_tests including the new pid_ns testcase pass.
> 

IMHO this approach looks ok too. I just feel that checkpoint_vpids() could
be re-worked a bit in order to not impose an artificial limit of
CKPT_HDR_PIDS_CHUNK to the depth of pid namespaces, even if it is 256 (see
suggested changes below).

It would probably be safer too to use task_active_pid_ns() instead of
task->nsproxy->pid_ns, just in case some PID namespace unsharing like proposed
by Eric makes it to mainline.

Thanks,

Louis

> Signed-off-by: Serge E. Hallyn <serue at us.ibm.com>
> ---
>  checkpoint/checkpoint.c          |  113 ++++++++++++++++++++++++++++++++++----
>  checkpoint/process.c             |   18 +++++-
>  checkpoint/restart.c             |   45 ++++++++++++++-
>  checkpoint/sys.c                 |    2 +
>  include/linux/checkpoint.h       |    2 +-
>  include/linux/checkpoint_hdr.h   |   16 +++++
>  include/linux/checkpoint_types.h |    3 +
>  kernel/nsproxy.c                 |    9 ++-
>  8 files changed, 186 insertions(+), 22 deletions(-)
> 
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index f27af41..fe3546a 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -27,6 +27,7 @@
>  #include <linux/deferqueue.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> +#include <linux/pid_namespace.h>
>  
>  /* unique checkpoint identifier (FIXME: should be per-container ?) */
>  static atomic_t ctx_count = ATOMIC_INIT(0);
> @@ -242,6 +243,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  	struct task_struct *root = ctx->root_task;
>  	struct nsproxy *nsproxy;
>  	int ret = 0;
> +	struct pid_namespace *pidns;
>  
>  	ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
>  
> @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  		_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
>  		ret = -EPERM;
>  	}
> -	/* no support for >1 private pidns */
> -	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
> -		_ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
> -		ret = -EPERM;
> +	/* pidns must be descendent of root_nsproxy */
> +	pidns = nsproxy->pid_ns;
> +	while (pidns != ctx->root_nsproxy->pid_ns) {
> +		if (pidns == &init_pid_ns) {
> +			ret = -EPERM;
> +			_ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
> +			break;
> +		}
> +		pidns = pidns->parent;
>  	}
>  	rcu_read_unlock();
>  
> @@ -305,15 +312,19 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  
>  #define CKPT_HDR_PIDS_CHUNK	256
>  
> +/*
> + * Write the pids in ctx->root_nsproxy->pidns.  This info is
> + * needed at restart to unambiguously dereference tasks.
> + */
>  static int checkpoint_pids(struct ckpt_ctx *ctx)
>  {
>  	struct ckpt_pids *h;
> -	struct pid_namespace *ns;
> +	struct pid_namespace *root_pidns;
>  	struct task_struct *task;
>  	struct task_struct **tasks_arr;
>  	int nr_tasks, n, pos = 0, ret = 0;
>  
> -	ns = ctx->root_nsproxy->pid_ns;
> +	root_pidns = ctx->root_nsproxy->pid_ns;
>  	tasks_arr = ctx->tasks_arr;
>  	nr_tasks = ctx->nr_tasks;
>  	BUG_ON(nr_tasks <= 0);
> @@ -331,15 +342,21 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
>  	do {
>  		rcu_read_lock();
>  		for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
> +			struct pid_namespace *task_pidns;
>  			task = tasks_arr[pos];
>  
> -			h[n].vpid = task_pid_nr_ns(task, ns);
> -			h[n].vtgid = task_tgid_nr_ns(task, ns);
> -			h[n].vpgid = task_pgrp_nr_ns(task, ns);
> -			h[n].vsid = task_session_nr_ns(task, ns);
> -			h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
> +			h[n].vpid = task_pid_nr_ns(task, root_pidns);
> +			h[n].vtgid = task_tgid_nr_ns(task, root_pidns);
> +			h[n].vpgid = task_pgrp_nr_ns(task, root_pidns);
> +			h[n].vsid = task_session_nr_ns(task, root_pidns);
> +			h[n].vppid = task_tgid_nr_ns(task->real_parent,
> +					root_pidns);
> +			task_pidns = task_nsproxy(task)->pid_ns;
> +			h[n].rpid = task_pid_vnr(task);
> +			h[n].depth = task_pidns->level - root_pidns->level;
>  			ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
>  				   pos, h[n].vpid, h[n].vtgid, h[n].vppid);
> +			ctx->nr_vpids += h[n].depth;
>  			pos++;
>  		}
>  		rcu_read_unlock();
> @@ -356,6 +373,61 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
>  	return ret;
>  }
>  
> +static int checkpoint_vpids(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_vpid *h;
> +	struct pid_namespace *root_pidns, *task_pidns;
> +	struct task_struct *task;
> +	int ret, nr_tasks = ctx->nr_tasks;
> +	int tidx = 0, /* index into task array */
> +		hidx = 0; /* pids written into current ckpt_vpids chunk */
> +
> +	root_pidns = ctx->root_nsproxy->pid_ns;
> +	nr_tasks = ctx->nr_tasks;
> +
> +	ret = ckpt_write_obj_type(ctx, NULL,
> +				  sizeof(*h) * ctx->nr_vpids,
> +				  CKPT_HDR_BUFFER);
> +	if (ret < 0)
> +		return ret;
> +
> +	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	do {
> +		rcu_read_lock();
> +		while (tidx < nr_tasks) {
> +			int vidx; /* vpid index */
> +			int nsdelta;
> +
> +			task = ctx->tasks_arr[tidx];
> +			task_pidns = task_nsproxy(task)->pid_ns;
> +			nsdelta = task_pidns->level - root_pidns->level;
> +			if (hidx + nsdelta >= CKPT_HDR_PIDS_CHUNK)

I think that (hidx + nsdelta > CKPT_HDR_PIDS_CHUNK) checks more accurately the limit.

> +				break;
> +
> +			for (vidx = 0; vidx < nsdelta; vidx++) {
> +				h[vidx].pid = task_pid_nr_ns(task, task_pidns);

Here:
				h[hidx + vidx]

> +				task_pidns = task_pidns->parent;
> +			}
> +
> +			hidx += nsdelta;
> +			tidx++;
> +		}
> +		rcu_read_unlock();
> +
> +		ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
> +		if (ret < 0)
> +			break;
> +
> +		hidx = 0;
> +	} while (tidx < nr_tasks);
> +
> +	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> +	return ret;
> +}

Maybe re-work it this way:

static int checkpoint_vpids(struct ckpt_ctx *ctx)
{
	struct ckpt_vpid *h;
	struct pid_namespace *root_pidns, *task_pidns, *active_pidns;
	struct task_struct *task;
	int ret, nr_tasks = ctx->nr_tasks;
	int tidx = 0, /* index into task array */
		hidx = 0; /* pids written into current ckpt_vpids chunk */
		vidx = 0; /* vpid index for current task */

	root_pidns = ctx->root_nsproxy->pid_ns;
	nr_tasks = ctx->nr_tasks;

	ret = ckpt_write_obj_type(ctx, NULL,
				  sizeof(*h) * ctx->nr_vpids,
				  CKPT_HDR_BUFFER);
	if (ret < 0)
		return ret;

	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
	if (!h)
		return -ENOMEM;

	do {
		rcu_read_lock();
		while (tidx < nr_tasks && hidx < CKPT_HDR_PIDS_CHUNK) {
			int nsdelta;

			task = ctx->tasks_arr[tidx];
			active_pidns = task_active_pid_ns(task);
			nsdelta = active_pidns->level - root_pidns->level;
			if (hidx + nsdelta - vidx > CKPT_HDR_PIDS_CHUNK)
				/*
				 * We will release rcu before recording the
				 * remaining vpids, but neither task nor its
				 * pid can disappear.
				 */
				nsdelta = CKPT_HDR_PIDS_CHUNK - hidx + vidx;

			if (vidx == 0)
				task_pidns = active_pidns;
			for (; vidx < nsdelta; vidx++) {
				h[hidx].pid = task_pid_nr_ns(task, task_pidns);
				hidx++;
				task_pidns = task_pidns->parent;
			}

			if (task_pidns == root_pidns) {
				tidx++;
				vidx = 0;
			}
		}
		rcu_read_unlock();

		ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
		if (ret < 0)
			break;

		hidx = 0;
	} while (tidx < nr_tasks);

	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
	return ret;
}

[...]

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 197 bytes
Desc: Digital signature
URL: <http://lists.openvz.org/pipermail/devel/attachments/20100322/22493c3b/attachment-0001.sig>
-------------- next part --------------
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers


More information about the Devel mailing list