[CRIU] [PATCH 7/7] restore: handle the case where zombies are reparented
Tycho Andersen
tycho.andersen at canonical.com
Tue Jun 28 08:04:08 PDT 2016
On Tue, Jun 28, 2016 at 04:00:50PM +0300, Pavel Emelyanov wrote:
> On 06/23/2016 06:13 PM, Tycho Andersen wrote:
> > For example, if a zombie has a helper that sets up its session id, the
> > zombie will be reparented to the init task, which will then potentially get
> > a SIGCHLD for a task which isn't its direct child zombie, which we didn't
> > handle. Instead, let's recursively find all the zombies for the init task,
> > in case they get reparented this way.
>
> This is checked by the test in patch #3, right?
Yep.
> > This commit is a little ugly: we are now passing three lists of pids
> > (helpers, zombies, and things that are allowed to die) to the restorer blob
> > for each task. We do need to have some way to distinguish, because:
> >
> > * we want to wait on helpers
> > * we don't want to wait on zombies (i.e. we want to keep the pid dead), but
> > confirm that it actually died
> > * we don't want to wait on anything that gets reparented, but we don't want
> > to error out if we get a SIGCHLD for it
> >
> > We could introduce a new struct:
> >
> > struct pid_info {
> > pid_t pid;
> > bool zombie;
> > bool direct_child;
> > }
> >
> > to handle this instead of the three separate lists. I'm not sure which is
> > cleaner, but I'd be happy to refactor to the other way if that's better.
>
> What if we count these non-ours zombies into existing zombies? The only difference
> from this patch I see is that the wait_zombies() would call wait() on it. Is it bad?
I thought I tried this and ran into some problem, but I don't remember
what it was. I might have done this before I had the other patch about
calculating the number of zombies, which probably would have caused
problems. Anyway, I'll revisit this and figure it out for the next
series.
> > Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
> > ---
> > criu/cr-restore.c | 31 ++++++++++++++++++++++++++-----
> > criu/include/restorer.h | 3 +++
> > criu/pie/restorer.c | 20 ++++++--------------
> > 3 files changed, 35 insertions(+), 19 deletions(-)
> >
> > diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> > index 771ea50..9a655c8 100644
> > --- a/criu/cr-restore.c
> > +++ b/criu/cr-restore.c
> > @@ -362,14 +362,16 @@ err:
> > return ret;
> > }
> >
> > -static int collect_child_pids(int state, unsigned int *n)
> > +static int collect_child_pids(struct pstree_item *root, int state, unsigned int *n, bool recurse)
> > {
> > struct pstree_item *pi;
> >
> > - *n = 0;
> > - list_for_each_entry(pi, ¤t->children, sibling) {
> > + list_for_each_entry(pi, &root->children, sibling) {
> > pid_t *child;
> >
> > + if (recurse && collect_child_pids(pi, state, n, recurse) < 0)
> > + return -1;
> > +
> > if (pi->pid.state != state)
> > continue;
> >
> > @@ -387,13 +389,28 @@ static int collect_child_pids(int state, unsigned int *n)
> > static int collect_helper_pids(struct task_restore_args *ta)
> > {
> > ta->helpers = (pid_t *)rst_mem_align_cpos(RM_PRIVATE);
> > - return collect_child_pids(TASK_HELPER, &ta->helpers_n);
> > + ta->helpers_n = 0;
> > + return collect_child_pids(current, TASK_HELPER, &ta->helpers_n, false);
> > }
> >
> > static int collect_zombie_pids(struct task_restore_args *ta)
> > {
> > ta->zombies = (pid_t *)rst_mem_align_cpos(RM_PRIVATE);
> > - return collect_child_pids(TASK_DEAD, &ta->zombies_n);
> > + ta->zombies_n = 0;
> > + return collect_child_pids(current, TASK_DEAD, &ta->zombies_n, false);
> > +}
> > +
> > +static int collect_allowed_to_die_pids(struct task_restore_args *ta)
> > +{
> > + ta->allowed_to_die = (pid_t *)rst_mem_align_cpos(RM_PRIVATE);
> > + ta->allowed_to_die_n = 0;
> > +
> > + if (collect_child_pids(current, TASK_DEAD, &ta->allowed_to_die_n, current == root_item) < 0)
> > + return -1;
> > + if (collect_child_pids(current, TASK_HELPER, &ta->allowed_to_die_n, current == root_item) < 0)
> > + return -1;
> > +
> > + return 0;
> > }
> >
> > static int open_core(int pid, CoreEntry **pcore)
> > @@ -530,6 +547,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
> > if (collect_zombie_pids(ta) < 0)
> > return -1;
> >
> > + if (collect_allowed_to_die_pids(ta) < 0)
> > + return -1;
> > +
> > if (inherit_fd_fini() < 0)
> > return -1;
> >
> > @@ -2867,6 +2887,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> > RST_MEM_FIXUP_PPTR(task_args->helpers);
> > RST_MEM_FIXUP_PPTR(task_args->zombies);
> > RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
> > + RST_MEM_FIXUP_PPTR(task_args->allowed_to_die);
> >
> > if (core->tc->has_seccomp_mode)
> > task_args->seccomp_mode = core->tc->seccomp_mode;
> > diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> > index fad9e09..004ea9e 100644
> > --- a/criu/include/restorer.h
> > +++ b/criu/include/restorer.h
> > @@ -149,6 +149,9 @@ struct task_restore_args {
> > struct sock_fprog *seccomp_filters;
> > unsigned int seccomp_filters_n;
> >
> > + pid_t *allowed_to_die;
> > + unsigned int allowed_to_die_n;
> > +
> > /* * * * * * * * * * * * * * * * * * * * */
> >
> > unsigned long task_size;
> > diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> > index 2578f77..2031f9c 100644
> > --- a/criu/pie/restorer.c
> > +++ b/criu/pie/restorer.c
> > @@ -56,10 +56,8 @@
> >
> > static struct task_entries *task_entries;
> > static futex_t thread_inprogress;
> > -static pid_t *helpers;
> > -static int n_helpers;
> > -static pid_t *zombies;
> > -static int n_zombies;
> > +static pid_t *allowed_to_die;
> > +static int n_allowed_to_die;
> >
> > extern void cr_restore_rt (void) asm ("__cr_restore_rt")
> > __attribute__ ((visibility ("hidden")));
> > @@ -71,12 +69,8 @@ static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
> >
> > /* We can ignore helpers that die, we expect them to after
> > * CR_STATE_RESTORE is finished. */
> > - for (i = 0; i < n_helpers; i++)
> > - if (siginfo->si_pid == helpers[i])
> > - return;
> > -
> > - for (i = 0; i < n_zombies; i++)
> > - if (siginfo->si_pid == zombies[i])
> > + for (i = 0; i < n_allowed_to_die; i++)
> > + if (siginfo->si_pid == allowed_to_die[i])
> > return;
> >
> > if (siginfo->si_code & CLD_EXITED)
> > @@ -1083,10 +1077,8 @@ long __export_restore_task(struct task_restore_args *args)
> > #endif
> >
> > task_entries = args->task_entries;
> > - helpers = args->helpers;
> > - n_helpers = args->helpers_n;
> > - zombies = args->zombies;
> > - n_zombies = args->zombies_n;
> > + allowed_to_die = args->allowed_to_die;
> > + n_allowed_to_die = args->allowed_to_die_n;
> > *args->breakpoint = rst_sigreturn;
> >
> > ksigfillset(&act.rt_sa_mask);
> >
>
More information about the CRIU
mailing list