[Devel] Re: [RFC v12][PATCH 14/14] Restart multiple processes
Serge E. Hallyn
serue at us.ibm.com
Sun Jan 4 12:19:57 PST 2009
Quoting Oren Laadan (orenl at cs.columbia.edu):
> Restarting of multiple processes expects all restarting tasks to call
> sys_restart(). Once inside the system call, each task will restart
> itself at the same order that they were saved. The internals of the
> syscall will take care of in-kernel synchronization bewteen tasks.
>
> This patch does _not_ create the task tree in the kernel. Instead it
> assumes that all tasks are created in some way and then invoke the
> restart syscall. You can use the userspace mktree.c program to do
> that.
>
> The init task (*) has a special role: it allocates the restart context
> (ctx), and coordinates the operation. In particular, it first waits
> until all participating tasks enter the kernel, and provides them the
> common restart context. Once everyone in ready, it begins to restart
> itself.
>
> In contrast, the other tasks enter the kernel, locate the init task (*)
> and grab its restart context, and then wait for their turn to restore.
>
> When a task (init or not) completes its restart, it hands the control
> over to the next in line, by waking that task.
>
> An array of pids (the one saved during the checkpoint) is used to
> synchronize the operation. The first task in the array is the init
> task (*). The restart context (ctx) maintain a "current position" in
> the array, which indicates which task is currently active. Once the
> currently active task completes its own restart, it increments that
> position and wakes up the next task.
>
> Restart assumes that userspace provides meaningful data, otherwise
> it's garbage-in-garbage-out. In this case, the syscall may block
> indefinitely, but in TASK_INTERRUPTIBLE, so the user can ctrl-c or
> otherwise kill the stray restarting tasks.
>
> In terms of security, restart runs as the user the invokes it, so it
> will not allow a user to do more than is otherwise permitted by the
> usual system semantics and policy.
>
> Currently we ignore threads and zombies, as well as session ids.
> Add support for multiple processes
>
> (*) For containers, restart should be called inside a fresh container
> by the init task of that container. However, it is also possible to
> restart applications not necessarily inside a container, and without
> restoring the original pids of the processes (that is, provided that
> the application can tolerate such behavior). This is useful to allow
> multi-process restart of tasks not isolated inside a container, and
> also for debugging.
>
> Changelog[v12]:
> - Replace obsolete cr_debug() with pr_debug()
>
> Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
Thanks, Oren.
Acked-by: Serge Hallyn <serue at us.ibm.com>
with a few comments below.
> ---
> checkpoint/restart.c | 214 +++++++++++++++++++++++++++++++++++++++++++-
> checkpoint/sys.c | 34 ++++++--
> include/linux/checkpoint.h | 23 ++++-
> include/linux/sched.h | 1 +
> 4 files changed, 258 insertions(+), 14 deletions(-)
>
> diff --git a/checkpoint/restart.c b/checkpoint/restart.c
> index 0c46abf..6b4cd75 100644
> --- a/checkpoint/restart.c
> +++ b/checkpoint/restart.c
> @@ -10,6 +10,7 @@
>
> #include <linux/version.h>
> #include <linux/sched.h>
> +#include <linux/wait.h>
> #include <linux/file.h>
> #include <linux/magic.h>
> #include <linux/checkpoint.h>
> @@ -276,30 +277,235 @@ static int cr_read_task(struct cr_ctx *ctx)
> return ret;
> }
>
> +/* cr_read_tree - read the tasks tree into the checkpoint context */
> +static int cr_read_tree(struct cr_ctx *ctx)
> +{
> + struct cr_hdr_tree *hh = cr_hbuf_get(ctx, sizeof(*hh));
> + int parent, size, ret = -EINVAL;
> +
> + parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TREE);
> + if (parent < 0) {
> + ret = parent;
> + goto out;
> + } else if (parent != 0)
> + goto out;
> +
> + if (hh->tasks_nr < 0)
> + goto out;
> +
> + ctx->pids_nr = hh->tasks_nr;
> + size = sizeof(*ctx->pids_arr) * ctx->pids_nr;
> + if (size < 0) /* overflow ? */
> + goto out;
> +
> + ctx->pids_arr = kmalloc(size, GFP_KERNEL);
> + if (!ctx->pids_arr) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + ret = cr_kread(ctx, ctx->pids_arr, size);
> + out:
> + cr_hbuf_put(ctx, sizeof(*hh));
> + return ret;
> +}
> +
> +static int cr_wait_task(struct cr_ctx *ctx)
> +{
> + pid_t pid = task_pid_vnr(current);
> +
> + pr_debug("pid %d waiting\n", pid);
> + return wait_event_interruptible(ctx->waitq, ctx->pids_active == pid);
> +}
> +
> +static int cr_next_task(struct cr_ctx *ctx)
> +{
> + struct task_struct *tsk;
> +
> + ctx->pids_pos++;
> +
> + pr_debug("pids_pos %d %d\n", ctx->pids_pos, ctx->pids_nr);
> + if (ctx->pids_pos == ctx->pids_nr) {
> + complete(&ctx->complete);
> + return 0;
> + }
> +
> + ctx->pids_active = ctx->pids_arr[ctx->pids_pos].vpid;
> +
> + pr_debug("pids_next %d\n", ctx->pids_active);
> +
> + rcu_read_lock();
> + tsk = find_task_by_pid_ns(ctx->pids_active, ctx->root_nsproxy->pid_ns);
> + if (tsk)
> + wake_up_process(tsk);
> + rcu_read_unlock();
> +
> + if (!tsk) {
> + ctx->pids_err = -ESRCH;
> + complete(&ctx->complete);
> + return -ESRCH;
> + }
> +
> + return 0;
> +}
> +
> +/* FIXME: this should be per container */
> +DECLARE_WAIT_QUEUE_HEAD(cr_restart_waitq);
> +
> +static int do_restart_task(struct cr_ctx *ctx, pid_t pid)
Passing ctx in here, when it is always NULL, is just a bit
confusing, and, since you goto out and cr_ctx_put(ctx) without
initializing ctx, you make verifying that that's ok a step
harder.
Do you intend to ever pass in non-NULL in later patches?
> +{
> + struct task_struct *root_task;
> + int ret;
> +
> + rcu_read_lock();
> + root_task = find_task_by_pid_ns(pid, current->nsproxy->pid_ns);
> + if (root_task)
> + get_task_struct(root_task);
> + rcu_read_unlock();
> +
> + if (!root_task)
> + return -EINVAL;
> +
> + /*
> + * wait for container init to initialize the restart context, then
> + * grab a reference to that context, and if we're the last task to
> + * do it, notify the container init.
> + */
> + ret = wait_event_interruptible(cr_restart_waitq,
> + root_task->checkpoint_ctx);
Would seem more sensible to do the above using the ctx which
you grabbed under task_lock(root_task) next. Your whole
locking of root_task->checkpoint_ctx seems haphazard (see below).
> + if (ret < 0)
> + goto out;
> +
> + task_lock(root_task);
> + ctx = root_task->checkpoint_ctx;
> + if (ctx)
> + cr_ctx_get(ctx);
> + task_unlock(root_task);
> +
> + if (!ctx) {
> + ret = -EAGAIN;
> + goto out;
> + }
> +
> + if (atomic_dec_and_test(&ctx->tasks_count))
> + complete(&ctx->complete);
> +
> + /* wait for our turn, do the restore, and tell next task in line */
> + ret = cr_wait_task(ctx);
> + if (ret < 0)
> + goto out;
> + ret = cr_read_task(ctx);
> + if (ret < 0)
> + goto out;
> + ret = cr_next_task(ctx);
> +
> + out:
> + cr_ctx_put(ctx);
> + put_task_struct(root_task);
> + return ret;
> +}
> +
> +static int cr_wait_all_tasks_start(struct cr_ctx *ctx)
> +{
> + int ret;
> +
> + if (ctx->pids_nr == 1)
> + return 0;
> +
> + init_completion(&ctx->complete);
> + current->checkpoint_ctx = ctx;
> +
> + wake_up_all(&cr_restart_waitq);
> +
> + ret = wait_for_completion_interruptible(&ctx->complete);
> + if (ret < 0)
> + return ret;
> +
> + task_lock(current);
> + current->checkpoint_ctx = NULL;
> + task_unlock(current);
Who can you be racing with here? All other tasks should have
already dereferenced current->checkpoint_ctx.
If you want to always lock root_task around setting and
reading of (root_task|current)->checkpoint_ctx, that's
ok, but I think you can sufficiently argue that your
completion is completely protecint readers from the sole
writer.
> +
> + return 0;
> +}
> +
> +static int cr_wait_all_tasks_finish(struct cr_ctx *ctx)
> +{
> + int ret;
> +
> + if (ctx->pids_nr == 1)
> + return 0;
> +
> + init_completion(&ctx->complete);
> +
> + ret = cr_next_task(ctx);
> + if (ret < 0)
> + return ret;
> +
> + ret = wait_for_completion_interruptible(&ctx->complete);
> + if (ret < 0)
> + return ret;
> +
> + return 0;
> +}
> +
> /* setup restart-specific parts of ctx */
> static int cr_ctx_restart(struct cr_ctx *ctx, pid_t pid)
> {
> + ctx->root_pid = pid;
> + ctx->root_task = current;
> + ctx->root_nsproxy = current->nsproxy;
> +
> + get_task_struct(ctx->root_task);
> + get_nsproxy(ctx->root_nsproxy);
> +
> + atomic_set(&ctx->tasks_count, ctx->pids_nr - 1);
> +
> return 0;
> }
>
> -int do_restart(struct cr_ctx *ctx, pid_t pid)
> +static int do_restart_root(struct cr_ctx *ctx, pid_t pid)
> {
> int ret;
>
> + ret = cr_read_head(ctx);
> + if (ret < 0)
> + goto out;
> + ret = cr_read_tree(ctx);
> + if (ret < 0)
> + goto out;
> +
> ret = cr_ctx_restart(ctx, pid);
> if (ret < 0)
> goto out;
> - ret = cr_read_head(ctx);
> +
> + /* wait for all other tasks to enter do_restart_task() */
> + ret = cr_wait_all_tasks_start(ctx);
> if (ret < 0)
> goto out;
> +
> ret = cr_read_task(ctx);
> if (ret < 0)
> goto out;
> - ret = cr_read_tail(ctx);
> +
> + /* wait for all other tasks to complete do_restart_task() */
> + ret = cr_wait_all_tasks_finish(ctx);
> if (ret < 0)
> goto out;
>
> - /* on success, adjust the return value if needed [TODO] */
> + ret = cr_read_tail(ctx);
> +
> out:
> return ret;
> }
> +
> +int do_restart(struct cr_ctx *ctx, pid_t pid)
> +{
> + int ret;
> +
> + if (ctx)
> + ret = do_restart_root(ctx, pid);
> + else
> + ret = do_restart_task(ctx, pid);
> +
> + /* on success, adjust the return value if needed [TODO] */
> + return ret;
> +}
> diff --git a/checkpoint/sys.c b/checkpoint/sys.c
> index 0436ef3..f26b0c6 100644
> --- a/checkpoint/sys.c
> +++ b/checkpoint/sys.c
> @@ -167,6 +167,8 @@ static void cr_task_arr_free(struct cr_ctx *ctx)
>
> static void cr_ctx_free(struct cr_ctx *ctx)
> {
> + BUG_ON(atomic_read(&ctx->refcount));
> +
> if (ctx->file)
> fput(ctx->file);
>
> @@ -185,6 +187,8 @@ static void cr_ctx_free(struct cr_ctx *ctx)
> if (ctx->root_task)
> put_task_struct(ctx->root_task);
>
> + kfree(ctx->pids_arr);
> +
> kfree(ctx);
> }
>
> @@ -199,8 +203,10 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
>
> ctx->flags = flags;
>
> + atomic_set(&ctx->refcount, 0);
> INIT_LIST_HEAD(&ctx->pgarr_list);
> INIT_LIST_HEAD(&ctx->pgarr_pool);
> + init_waitqueue_head(&ctx->waitq);
>
> err = -EBADF;
> ctx->file = fget(fd);
> @@ -215,6 +221,7 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
> if (cr_objhash_alloc(ctx) < 0)
> goto err;
>
> + atomic_inc(&ctx->refcount);
> return ctx;
>
> err:
> @@ -222,6 +229,17 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
> return ERR_PTR(err);
> }
>
> +void cr_ctx_get(struct cr_ctx *ctx)
> +{
> + atomic_inc(&ctx->refcount);
> +}
> +
> +void cr_ctx_put(struct cr_ctx *ctx)
> +{
> + if (ctx && atomic_dec_and_test(&ctx->refcount))
> + cr_ctx_free(ctx);
> +}
> +
> /**
> * sys_checkpoint - checkpoint a container
> * @pid: pid of the container init(1) process
> @@ -249,7 +267,7 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
> if (!ret)
> ret = ctx->crid;
>
> - cr_ctx_free(ctx);
> + cr_ctx_put(ctx);
> return ret;
> }
>
> @@ -264,7 +282,7 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
> */
> asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
> {
> - struct cr_ctx *ctx;
> + struct cr_ctx *ctx = NULL;
> pid_t pid;
> int ret;
>
> @@ -272,15 +290,17 @@ asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
> if (flags)
> return -EINVAL;
>
> - ctx = cr_ctx_alloc(fd, flags | CR_CTX_RSTR);
> - if (IS_ERR(ctx))
> - return PTR_ERR(ctx);
> -
> /* FIXME: for now, we use 'crid' as a pid */
> pid = (pid_t) crid;
>
> + if (pid == task_pid_vnr(current))
> + ctx = cr_ctx_alloc(fd, flags | CR_CTX_RSTR);
> +
> + if (IS_ERR(ctx))
> + return PTR_ERR(ctx);
> +
> ret = do_restart(ctx, pid);
>
> - cr_ctx_free(ctx);
> + cr_ctx_put(ctx);
> return ret;
> }
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 86fcec9..041bd4d 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -13,10 +13,11 @@
> #include <linux/fs.h>
> #include <linux/path.h>
> #include <linux/sched.h>
> +#include <asm/atomic.h>
>
> #ifdef CONFIG_CHECKPOINT_RESTART
>
> -#define CR_VERSION 2
> +#define CR_VERSION 3
>
> struct cr_ctx {
> int crid; /* unique checkpoint id */
> @@ -34,8 +35,7 @@ struct cr_ctx {
> void *hbuf; /* temporary buffer for headers */
> int hpos; /* position in headers buffer */
>
> - struct task_struct **tasks_arr; /* array of all tasks in container */
> - int tasks_nr; /* size of tasks array */
> + atomic_t refcount;
>
> struct cr_objhash *objhash; /* hash for shared objects */
>
> @@ -43,6 +43,20 @@ struct cr_ctx {
> struct list_head pgarr_pool; /* pool of empty page arrays chain */
>
> struct path fs_mnt; /* container root (FIXME) */
> +
> + /* [multi-process checkpoint] */
> + struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
> + int tasks_nr; /* size of tasks array */
> +
> + /* [multi-process restart] */
> + struct cr_hdr_pids *pids_arr; /* array of all pids [restart] */
> + int pids_nr; /* size of pids array */
> + int pids_pos; /* position pids array */
> + int pids_err; /* error occured ? */
> + pid_t pids_active; /* pid of (next) active task */
> + atomic_t tasks_count; /* sync of restarting tasks */
'sync of restarting tasks' for 3 vars suggests one or two or three
might benefit from more detailed comment :)
> + struct completion complete; /* sync of restarting tasks */
> + wait_queue_head_t waitq; /* sync of restarting tasks */
> };
>
> /* cr_ctx: flags */
> @@ -55,6 +69,9 @@ extern int cr_kread(struct cr_ctx *ctx, void *buf, int count);
> extern void *cr_hbuf_get(struct cr_ctx *ctx, int n);
> extern void cr_hbuf_put(struct cr_ctx *ctx, int n);
>
> +extern void cr_ctx_get(struct cr_ctx *ctx);
> +extern void cr_ctx_put(struct cr_ctx *ctx);
> +
> /* shared objects handling */
>
> enum {
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index faa2ec6..0150e90 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1359,6 +1359,7 @@ struct task_struct {
>
> #ifdef CONFIG_CHECKPOINT_RESTART
> atomic_t may_checkpoint;
> + struct cr_ctx *checkpoint_ctx;
> #endif
> };
>
> --
> 1.5.4.3
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list