[CRIU] [PATCH 1/2] dump: use freezer cgroup to seize processes
Andrew Vagin
avagin at gmail.com
Thu Jul 23 06:18:47 PDT 2015
On Thu, Jul 23, 2015 at 08:38:59AM -0400, Christopher Covington wrote:
> Hi Andrey,
>
> On 07/22/2015 12:07 PM, Andrey Vagin wrote:
> > Without using a freezer cgroup, we need to do a few iterations to catch
> > all tasks, because a new tasks can be born. If new tasks appear faster
> > than criu collects them, criu fails. The freezer cgroup allows to
> > solve this problem.
> >
> > We freeze the freezer group, then attache to tasks with ptrace and thaw
> > the freezer cgroup.
> >
> > We suppose that all tasks which are going to be dumped in a specified freezer
> > group.
>
> Neat. I was recently experimenting with the cgroup freezer, and while I was
> not working with the latest and greatest version of CRIU, it appeared that
> CRIU hung when trying to dump an already frozen process.
>
> Right now I'm stopping processes (SIGSTOP) at certain instruction counts, and
> then CRIU dumping them. But after reading about why the cgroup freezer was
> created, it seemed like using it would be better as the process trees I work
> with get more complex, potentially including bash with its funny SIGCONT
> handling and multiple processes that need to be stopped/frozen all at the
> exact same time.
>
> How much/what kind of work do you think is required to support dumping
> already-frozen cgroups?
This version allows to dump processes in an already-frozen group. CRIU
attaches to all processes and then unfreezes the groop, because we are
going to execute the parasite code in porcesses.
>
> I've made a few little code review comments below.
Thank you for comments! I will fix.
>
> Thanks,
> Chris
>
> > diff --git a/cr-dump.c b/cr-dump.c
> > index af1b281..1c05fdf 100644
> > --- a/cr-dump.c
> > +++ b/cr-dump.c
> > @@ -788,7 +788,9 @@ static int collect_task(struct pstree_item *item);
> > static int collect_children(struct pstree_item *item)
> > {
> > pid_t *ch;
> > - int ret, i, nr_children, nr_inprogress;
> > + int ret, i, nr_children, nr_inprogress, freeze;
> > +
> > + freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
> >
> > ret = parse_children(item->pid.real, &ch, &nr_children);
> > if (ret < 0)
> > @@ -813,7 +815,7 @@ static int collect_children(struct pstree_item *item)
> > goto free;
> > }
> >
> > - ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds);
> > + ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds, freeze);
> > if (ret < 0) {
> > /*
> > * Here is a race window between parse_children() and seize(),
> > @@ -929,7 +931,9 @@ static inline bool thread_collected(struct pstree_item *i, pid_t tid)
> > static int collect_threads(struct pstree_item *item)
> > {
> > struct pid *threads = NULL;
> > - int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;
> > + int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0, freeze;
> > +
> > + freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
> >
> > ret = parse_threads(item->pid.real, &threads, &nr_threads);
> > if (ret < 0)
> > @@ -962,7 +966,7 @@ static int collect_threads(struct pstree_item *item)
> > pr_info("\tSeizing %d's %d thread\n",
> > item->pid.real, pid);
> >
> > - ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds);
> > + ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds, freeze);
> > if (ret < 0) {
> > /*
> > * Here is a race window between parse_threads() and seize(),
> > @@ -1006,6 +1010,9 @@ static int collect_loop(struct pstree_item *item,
> > {
> > int attempts = NR_ATTEMPTS, nr_inprogress = 1;
> >
> > + if (opts.freeze_cgroup)
> > + attempts = 2; /* double check that we skip nothing */
> > +
> > /*
> > * While we scan the proc and seize the children/threads
> > * new ones can appear (with clone(CLONE_PARENT) or with
> > @@ -1101,9 +1108,110 @@ int collect_pstree_ids(void)
> > return 0;
> > }
> >
> > +#define FREEZER_ATTEMPTS 10
> > +static int freeze_processes()
> > +{
> > + const char frozen[] = "FROZEN";
> > + const char thawed[] = "THAWED";
> > + int i, ret, fd, exit_code = -1;
> > + char path[PATH_MAX];
> > + bool last = false;
> > + FILE *f;
> > +
> > + snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
> > + fd = open(path, O_RDWR);
> > + if (fd < 0) {
> > + pr_perror("Unable to open %s", path);
> > + return -1;
> > + }
> > + if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
> > + pr_perror("Unable to freeze tasks");
> > + close(fd);
> > + return -1;
> > + }
> > +
> > + /*
> > + * There is not way to wait a specified state, so we need to poll the
> > + * freezer.state
> > + */
> > + for (i = 0; i < FREEZER_ATTEMPTS; i++) {
> > + struct timespec req = {};
> > +
> > + snprintf(path, sizeof(path), "%s/tasks", opts.freeze_cgroup);
> > + f = fopen(path, "r");
> > + while (fgets(path, sizeof(path), f)) {
> > + pid_t pid;
> > +
> > + pid = atoi(path);
> > +
> > + ret = wait4(pid, NULL, __WALL | WNOHANG, NULL);
> > + if (ret == 0) /* skip already seized tasks */
> > + continue;
> > + if (seize_task(pid, 0, NULL, PTRACE_FREEZE_PRE))
> > + goto err;
> > + }
> > + fclose(f);
> > +
> > + if (last)
> > + break;
> > +
> > + lseek(fd, 0, SEEK_SET);
> > + ret = read(fd, path, sizeof(path) - 1);
> > + if (ret <= 0) {
> > + pr_perror("Unable to get a current state");
> > + goto err;
> > + }
> > + if (path[ret - 1] == '\n')
> > + path[ret - 1] = 0;
> > + else
> > + path[ret] = 0;
> > +
> > + pr_debug("freezer.state=%s\n", path);
> > + if (strcmp(path, frozen) == 0) {
> > + /*
> > + * Enumirate all tasks one more time to collect all new
>
> Nit: Enumerate
>
> > + * tasks, which can be born while the cgroup are been freezing.
>
> Nit: is being frozen
>
> > + */
> > +
> > + last = true;
> > + continue;
> > + }
> > +
> > + pr_debug("freezer.state=%s instead of %s\n", path, frozen);
> > +
> > + req.tv_nsec = 10000000 * i;
> > + nanosleep(&req, NULL);
> > + }
> > +
> > + if (i == FREEZER_ATTEMPTS) {
> > + pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
> > + goto err;
> > + }
> > +
> > + exit_code = 0;
> > +err:
> > + lseek(fd, 0, SEEK_SET);
> > + if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) {
> > + pr_perror("Unable to thaw tasks");
> > + exit_code = -1;
> > + }
> > + if (close(fd)) {
> > + pr_perror("Unable to thaw tasks");
> > + return -1;
> > + }
> > +
> > + return exit_code;
> > +}
> > +
> > static int collect_pstree(pid_t pid)
> > {
> > - int ret;
> > + int ret, freeze = PTRACE_FREEZE;
> > +
> > + if (opts.freeze_cgroup) {
> > + if (freeze_processes())
> > + return -1;
> > + freeze = PTRACE_FREEZE_POST;
> > + }
> >
> > timing_start(TIME_FREEZING);
> >
> > @@ -1112,7 +1220,7 @@ static int collect_pstree(pid_t pid)
> > return -1;
> >
> > root_item->pid.real = pid;
> > - ret = seize_task(pid, -1, &dmpi(root_item)->pi_creds);
> > + ret = seize_task(pid, -1, &dmpi(root_item)->pi_creds, freeze);
> > if (ret < 0)
> > goto err;
> > pr_info("Seized task %d, state %d\n", pid, ret);
> > diff --git a/cr-exec.c b/cr-exec.c
> > index f3d55f6..b149c38 100644
> > --- a/cr-exec.c
> > +++ b/cr-exec.c
> > @@ -130,7 +130,7 @@ int cr_exec(int pid, char **opt)
> > goto out;
> > }
> >
> > - prev_state = ret = seize_task(pid, -1, &creds);
> > + prev_state = ret = seize_task(pid, -1, &creds, PTRACE_FREEZE);
> > if (ret < 0) {
> > pr_err("Can't seize task %d\n", pid);
> > goto out;
> > diff --git a/crtools.c b/crtools.c
> > index b085d33..f274348 100644
> > --- a/crtools.c
> > +++ b/crtools.c
> > @@ -234,6 +234,7 @@ int main(int argc, char *argv[], char *envp[])
> > { "enable-fs", required_argument, 0, 1065 },
> > { "enable-external-sharing", no_argument, 0, 1066 },
> > { "enable-external-masters", no_argument, 0, 1067 },
> > + { "freeze-cgroup", required_argument, 0, 1068 },
> > { },
> > };
> >
> > @@ -462,6 +463,9 @@ int main(int argc, char *argv[], char *envp[])
> > case 1067:
> > opts.enable_external_masters = true;
> > break;
> > + case 1068:
> > + opts.freeze_cgroup = optarg;
> > + break;
> > case 'M':
> > {
> > char *aux;
> > @@ -673,6 +677,8 @@ usage:
> > " 'cpu','fpu','all','ins','none'. To disable capability, prefix it with '^'.\n"
> > " --exec-cmd execute the command specified after '--' on successful\n"
> > " restore making it the parent of the restored process\n"
> > +" --freeze-cgroup\n"
> > +" use freeze cgroup to collect processes\n"
>
> Nit: cgroup freezer
>
> > "\n"
> > "* Special resources support:\n"
> > " -x|--" USK_EXT_PARAM " allow external unix connections\n"
> > diff --git a/include/cr_options.h b/include/cr_options.h
> > index 9ab8bba..09b0794 100644
> > --- a/include/cr_options.h
> > +++ b/include/cr_options.h
> > @@ -56,6 +56,7 @@ struct cr_options {
> > char *output;
> > char *root;
> > char *pidfile;
> > + char *freeze_cgroup;
> > struct list_head veth_pairs;
> > struct list_head scripts;
> > struct list_head ext_mounts;
> > diff --git a/include/ptrace.h b/include/ptrace.h
> > index 44b66c9..568e51e 100644
> > --- a/include/ptrace.h
> > +++ b/include/ptrace.h
> > @@ -67,7 +67,10 @@ struct ptrace_peeksiginfo_args {
> >
> > #define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8)
> >
> > -extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds);
> > +#define PTRACE_FREEZE 0
> > +#define PTRACE_FREEZE_PRE 1
> > +#define PTRACE_FREEZE_POST 2
>
> I don't understand what these mean in my first read through. Do you think you
> could briefly document them, either in the source or commit message?
>
> > +extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze);
> > extern int suspend_seccomp(pid_t pid);
> > extern int unseize_task(pid_t pid, int orig_state, int state);
> > extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
> > diff --git a/proc_parse.c b/proc_parse.c
> > index ed78d70..251cf3b 100644
> > --- a/proc_parse.c
> > +++ b/proc_parse.c
> > @@ -1162,6 +1162,7 @@ struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump)
> >
> > new->nsid = nsid;
> >
> > + pr_debug("%s\n", str);
> > ret = parse_mountinfo_ent(str, new, &fsname);
> > if (ret < 0) {
> > pr_err("Bad format in %d mountinfo: '%s'\n", pid, str);
> > diff --git a/ptrace.c b/ptrace.c
> > index 905eaec..dc96cfa 100644
> > --- a/ptrace.c
> > +++ b/ptrace.c
> > @@ -60,11 +60,11 @@ int suspend_seccomp(pid_t pid)
> > * up with someone else.
> > */
> >
> > -int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
> > +int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze)
> > {
> > siginfo_t si;
> > int status;
> > - int ret, ret2, ptrace_errno, wait_errno = 0;
> > + int ret = 0, ret2, ptrace_errno = 0, wait_errno = 0;
> > struct proc_status_creds cr;
> >
> > /*
> > @@ -72,23 +72,25 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
> > */
> > memzero(&cr, sizeof(struct proc_status_creds));
> >
> > - ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
> > - ptrace_errno = errno;
> > - if (ret == 0) {
> > - /*
> > - * If we SEIZE-d the task stop it before going
> > - * and reading its stat from proc. Otherwise task
> > - * may die _while_ we're doing it and we'll have
> > - * inconsistent seize/state pair.
> > - *
> > - * If task dies after we seize it but before we
> > - * do this interrupt, we'll notice it via proc.
> > - */
> > - ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
> > - if (ret < 0) {
> > - pr_perror("SEIZE %d: can't interrupt task", pid);
> > - ptrace(PTRACE_DETACH, pid, NULL, NULL);
> > - goto err;
> > + if (freeze != 2) {
>
> PTRACE_FREEZE_POST ?
>
> > + ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
> > + ptrace_errno = errno;
> > + if (ret == 0) {
> > + /*
> > + * If we SEIZE-d the task stop it before going
> > + * and reading its stat from proc. Otherwise task
> > + * may die _while_ we're doing it and we'll have
> > + * inconsistent seize/state pair.
> > + *
> > + * If task dies after we seize it but before we
> > + * do this interrupt, we'll notice it via proc.
> > + */
> > + ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
> > + if (ret < 0) {
> > + pr_perror("SEIZE %d: can't interrupt task", pid);
> > + ptrace(PTRACE_DETACH, pid, NULL, NULL);
> > + goto err;
> > + }
> > }
> > }
> >
> > @@ -101,7 +103,16 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
> >
> > try_again:
> > if (!ret) {
> > - ret = wait4(pid, &status, __WALL, NULL);
> > + int flags = 0;
> > +
> > + if (freeze == 1)
>
> PTRACE_FREEZE_PRE ?
>
> > + flags |= WNOHANG;
> > +
> > + ret = wait4(pid, &status, __WALL | flags, NULL);
> > + if (freeze == 1 && ret > 0) {
>
> PTRACE_FREEZE_PRE ?
>
> > + pr_err("The %d task is not frozen\n", pid);
> > + goto err;
> > + }
> > wait_errno = errno;
> > }
> >
> > @@ -126,6 +137,8 @@ try_again:
> >
> > return TASK_DEAD;
> > }
> > + if (freeze == 1)
>
> PTRACE_FREEZE_PRE ?
>
> > + return 0;
> >
> > if ((ppid != -1) && (cr.ppid != ppid)) {
> > pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
> >
>
>
> --
> Qualcomm Innovation Center, Inc.
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
More information about the CRIU
mailing list