[Devel] Re: [PATCH linux-cr] nested pid namespaces (v2)
Louis Rilling
Louis.Rilling at kerlabs.com
Mon Mar 22 03:55:03 PDT 2010
On Fri, Mar 19, 2010 at 04:39:55PM -0500, Serge E. Hallyn wrote:
> Support checkpoint and restart of tasks in nested pid namespaces. At
> Oren's request here is an alternative to my previous implementation. In
> this one, we keep the original single pids_array to minimize memory
> allocations. The pids array entries are augmented with a pidns depth
> (relative to the container init's pidns, and an "rpid" which is the pid
> in the checkpointer's pidns (or 0 if no valid pid exists). The rpid
> will be used by userspace to gather more information (like
> /proc/$$/mountinfo) after the kernel sys_checkpoint. If any tasks are
> in nested pid namespace, another single array holds all of the vpids.
> At restart those are used by userspace to determine how to call
> eclone(). Kernel ignores them.
>
> All cr_tests including the new pid_ns testcase pass.
>
IMHO this approach looks ok too. I just feel that checkpoint_vpids() could
be re-worked a bit in order to not impose an artificial limit of
CKPT_HDR_PIDS_CHUNK to the depth of pid namespaces, even if it is 256 (see
suggested changes below).
It would probably be safer too to use task_active_pid_ns() instead of
task->nsproxy->pid_ns, just in case some PID namespace unsharing like proposed
by Eric makes it to mainline.
Thanks,
Louis
> Signed-off-by: Serge E. Hallyn <serue at us.ibm.com>
> ---
> checkpoint/checkpoint.c | 113 ++++++++++++++++++++++++++++++++++----
> checkpoint/process.c | 18 +++++-
> checkpoint/restart.c | 45 ++++++++++++++-
> checkpoint/sys.c | 2 +
> include/linux/checkpoint.h | 2 +-
> include/linux/checkpoint_hdr.h | 16 +++++
> include/linux/checkpoint_types.h | 3 +
> kernel/nsproxy.c | 9 ++-
> 8 files changed, 186 insertions(+), 22 deletions(-)
>
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index f27af41..fe3546a 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -27,6 +27,7 @@
> #include <linux/deferqueue.h>
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
> +#include <linux/pid_namespace.h>
>
> /* unique checkpoint identifier (FIXME: should be per-container ?) */
> static atomic_t ctx_count = ATOMIC_INIT(0);
> @@ -242,6 +243,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> struct task_struct *root = ctx->root_task;
> struct nsproxy *nsproxy;
> int ret = 0;
> + struct pid_namespace *pidns;
>
> ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
>
> @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> _ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
> ret = -EPERM;
> }
> - /* no support for >1 private pidns */
> - if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
> - _ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
> - ret = -EPERM;
> + /* pidns must be descendent of root_nsproxy */
> + pidns = nsproxy->pid_ns;
> + while (pidns != ctx->root_nsproxy->pid_ns) {
> + if (pidns == &init_pid_ns) {
> + ret = -EPERM;
> + _ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
> + break;
> + }
> + pidns = pidns->parent;
> }
> rcu_read_unlock();
>
> @@ -305,15 +312,19 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>
> #define CKPT_HDR_PIDS_CHUNK 256
>
> +/*
> + * Write the pids in ctx->root_nsproxy->pidns. This info is
> + * needed at restart to unambiguously dereference tasks.
> + */
> static int checkpoint_pids(struct ckpt_ctx *ctx)
> {
> struct ckpt_pids *h;
> - struct pid_namespace *ns;
> + struct pid_namespace *root_pidns;
> struct task_struct *task;
> struct task_struct **tasks_arr;
> int nr_tasks, n, pos = 0, ret = 0;
>
> - ns = ctx->root_nsproxy->pid_ns;
> + root_pidns = ctx->root_nsproxy->pid_ns;
> tasks_arr = ctx->tasks_arr;
> nr_tasks = ctx->nr_tasks;
> BUG_ON(nr_tasks <= 0);
> @@ -331,15 +342,21 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
> do {
> rcu_read_lock();
> for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
> + struct pid_namespace *task_pidns;
> task = tasks_arr[pos];
>
> - h[n].vpid = task_pid_nr_ns(task, ns);
> - h[n].vtgid = task_tgid_nr_ns(task, ns);
> - h[n].vpgid = task_pgrp_nr_ns(task, ns);
> - h[n].vsid = task_session_nr_ns(task, ns);
> - h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
> + h[n].vpid = task_pid_nr_ns(task, root_pidns);
> + h[n].vtgid = task_tgid_nr_ns(task, root_pidns);
> + h[n].vpgid = task_pgrp_nr_ns(task, root_pidns);
> + h[n].vsid = task_session_nr_ns(task, root_pidns);
> + h[n].vppid = task_tgid_nr_ns(task->real_parent,
> + root_pidns);
> + task_pidns = task_nsproxy(task)->pid_ns;
> + h[n].rpid = task_pid_vnr(task);
> + h[n].depth = task_pidns->level - root_pidns->level;
> ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
> pos, h[n].vpid, h[n].vtgid, h[n].vppid);
> + ctx->nr_vpids += h[n].depth;
> pos++;
> }
> rcu_read_unlock();
> @@ -356,6 +373,61 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
> return ret;
> }
>
> +static int checkpoint_vpids(struct ckpt_ctx *ctx)
> +{
> + struct ckpt_vpid *h;
> + struct pid_namespace *root_pidns, *task_pidns;
> + struct task_struct *task;
> + int ret, nr_tasks = ctx->nr_tasks;
> + int tidx = 0, /* index into task array */
> + hidx = 0; /* pids written into current ckpt_vpids chunk */
> +
> + root_pidns = ctx->root_nsproxy->pid_ns;
> + nr_tasks = ctx->nr_tasks;
> +
> + ret = ckpt_write_obj_type(ctx, NULL,
> + sizeof(*h) * ctx->nr_vpids,
> + CKPT_HDR_BUFFER);
> + if (ret < 0)
> + return ret;
> +
> + h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> + if (!h)
> + return -ENOMEM;
> +
> + do {
> + rcu_read_lock();
> + while (tidx < nr_tasks) {
> + int vidx; /* vpid index */
> + int nsdelta;
> +
> + task = ctx->tasks_arr[tidx];
> + task_pidns = task_nsproxy(task)->pid_ns;
> + nsdelta = task_pidns->level - root_pidns->level;
> + if (hidx + nsdelta >= CKPT_HDR_PIDS_CHUNK)
I think that (hidx + nsdelta > CKPT_HDR_PIDS_CHUNK) checks more accurately the limit.
> + break;
> +
> + for (vidx = 0; vidx < nsdelta; vidx++) {
> + h[vidx].pid = task_pid_nr_ns(task, task_pidns);
Here:
h[hidx + vidx]
> + task_pidns = task_pidns->parent;
> + }
> +
> + hidx += nsdelta;
> + tidx++;
> + }
> + rcu_read_unlock();
> +
> + ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
> + if (ret < 0)
> + break;
> +
> + hidx = 0;
> + } while (tidx < nr_tasks);
> +
> + _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> + return ret;
> +}
Maybe re-work it this way:
static int checkpoint_vpids(struct ckpt_ctx *ctx)
{
struct ckpt_vpid *h;
struct pid_namespace *root_pidns, *task_pidns, *active_pidns;
struct task_struct *task;
int ret, nr_tasks = ctx->nr_tasks;
int tidx = 0, /* index into task array */
hidx = 0; /* pids written into current ckpt_vpids chunk */
vidx = 0; /* vpid index for current task */
root_pidns = ctx->root_nsproxy->pid_ns;
nr_tasks = ctx->nr_tasks;
ret = ckpt_write_obj_type(ctx, NULL,
sizeof(*h) * ctx->nr_vpids,
CKPT_HDR_BUFFER);
if (ret < 0)
return ret;
h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
if (!h)
return -ENOMEM;
do {
rcu_read_lock();
while (tidx < nr_tasks && hidx < CKPT_HDR_PIDS_CHUNK) {
int nsdelta;
task = ctx->tasks_arr[tidx];
active_pidns = task_active_pid_ns(task);
nsdelta = active_pidns->level - root_pidns->level;
if (hidx + nsdelta - vidx > CKPT_HDR_PIDS_CHUNK)
/*
* We will release rcu before recording the
* remaining vpids, but neither task nor its
* pid can disappear.
*/
nsdelta = CKPT_HDR_PIDS_CHUNK - hidx + vidx;
if (vidx == 0)
task_pidns = active_pidns;
for (; vidx < nsdelta; vidx++) {
h[hidx].pid = task_pid_nr_ns(task, task_pidns);
hidx++;
task_pidns = task_pidns->parent;
}
if (task_pidns == root_pidns) {
tidx++;
vidx = 0;
}
}
rcu_read_unlock();
ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
if (ret < 0)
break;
hidx = 0;
} while (tidx < nr_tasks);
_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
return ret;
}
[...]
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 197 bytes
Desc: Digital signature
URL: <http://lists.openvz.org/pipermail/devel/attachments/20100322/22493c3b/attachment-0001.sig>
-------------- next part --------------
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list