[CRIU] [PATCH 3/3] dump: use freezer cgroup to seize processes (v2)
Christopher Covington
cov at codeaurora.org
Tue Aug 4 06:26:36 PDT 2015
On 08/04/2015 08:06 AM, Andrew Vagin wrote:
> On Tue, Aug 04, 2015 at 07:50:46AM -0400, Christopher Covington wrote:
>> On 08/03/2015 10:18 PM, Andrey Vagin wrote:
>>> Without using a freezer cgroup, we need to do a few iterations to catch
>>> all tasks, because a new tasks can be born. If new tasks appear faster
>>> than criu collects them, criu fails. The freezer cgroup allows to
>>> solve this problem.
>>>
>>> We freeze the freezer group, then attaches to tasks with ptrace and thaw
>>> the freezer cgroup. We suppose that all tasks which are going to be
>>> dumped in a specified freezer group.
>>>
>>> v2: fix comments from Christopher
>>>
>>> Cc: Christopher Covington <cov at codeaurora.org>
>>> Signed-off-by: Andrey Vagin <avagin at openvz.org>
>>> ---
>>> cr-exec.c | 2 +-
>>> crtools.c | 6 ++
>>> include/cr_options.h | 1 +
>>> include/ptrace.h | 5 +-
>>> proc_parse.c | 1 +
>>> ptrace.c | 54 +++++++++------
>>> seize.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++++--
>>> 7 files changed, 226 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/cr-exec.c b/cr-exec.c
>>> index f3d55f6..b149c38 100644
>>> --- a/cr-exec.c
>>> +++ b/cr-exec.c
>>> @@ -130,7 +130,7 @@ int cr_exec(int pid, char **opt)
>>> goto out;
>>> }
>>>
>>> - prev_state = ret = seize_task(pid, -1, &creds);
>>> + prev_state = ret = seize_task(pid, -1, &creds, PTRACE_FREEZE);
>>> if (ret < 0) {
>>> pr_err("Can't seize task %d\n", pid);
>>> goto out;
>>> diff --git a/crtools.c b/crtools.c
>>> index 6af6080..9096420 100644
>>> --- a/crtools.c
>>> +++ b/crtools.c
>>> @@ -235,6 +235,7 @@ int main(int argc, char *argv[], char *envp[])
>>> { "enable-fs", required_argument, 0, 1065 },
>>> { "enable-external-sharing", no_argument, 0, 1066 },
>>> { "enable-external-masters", no_argument, 0, 1067 },
>>> + { "freeze-cgroup", required_argument, 0, 1068 },
>>> { },
>>> };
>>>
>>> @@ -465,6 +466,9 @@ int main(int argc, char *argv[], char *envp[])
>>> case 1067:
>>> opts.enable_external_masters = true;
>>> break;
>>> + case 1068:
>>> + opts.freeze_cgroup = optarg;
>>> + break;
>>> case 'M':
>>> {
>>> char *aux;
>>> @@ -676,6 +680,8 @@ usage:
>>> " 'cpu','fpu','all','ins','none'. To disable capability, prefix it with '^'.\n"
>>> " --exec-cmd execute the command specified after '--' on successful\n"
>>> " restore making it the parent of the restored process\n"
>>> +" --freeze-cgroup\n"
>>> +" use cgroup freezer to collect processes\n"
>>> "\n"
>>> "* Special resources support:\n"
>>> " -x|--" USK_EXT_PARAM "inode,.." " allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
>>> diff --git a/include/cr_options.h b/include/cr_options.h
>>> index 19c2f77..f981806 100644
>>> --- a/include/cr_options.h
>>> +++ b/include/cr_options.h
>>> @@ -57,6 +57,7 @@ struct cr_options {
>>> char *output;
>>> char *root;
>>> char *pidfile;
>>> + char *freeze_cgroup;
>>> struct list_head veth_pairs;
>>> struct list_head scripts;
>>> struct list_head ext_mounts;
>>> diff --git a/include/ptrace.h b/include/ptrace.h
>>> index 44b66c9..ce91a45 100644
>>> --- a/include/ptrace.h
>>> +++ b/include/ptrace.h
>>> @@ -67,7 +67,10 @@ struct ptrace_peeksiginfo_args {
>>>
>>> #define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8)
>>>
>>> -extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds);
>>> +#define PTRACE_FREEZE_PRE 0 /* Attach to the process (PTRACE_SEIZE and PTRACE_INTERRUPT) */
>>> +#define PTRACE_FREEZE_POST 1 /* Wait when the process will be stopped */
>>> +#define PTRACE_FREEZE 2 /* Do both stages together */
>>> +extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze);
>>> extern int suspend_seccomp(pid_t pid);
>>> extern int unseize_task(pid_t pid, int orig_state, int state);
>>> extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
>>> diff --git a/proc_parse.c b/proc_parse.c
>>> index 95d6505..ea98a97 100644
>>> --- a/proc_parse.c
>>> +++ b/proc_parse.c
>>> @@ -1135,6 +1135,7 @@ struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump)
>>>
>>> new->nsid = nsid;
>>>
>>> + pr_debug("%s\n", str);
>>> ret = parse_mountinfo_ent(str, new, &fsname);
>>> if (ret < 0) {
>>> pr_err("Bad format in %d mountinfo: '%s'\n", pid, str);
>>> diff --git a/ptrace.c b/ptrace.c
>>> index 905eaec..e614be4 100644
>>> --- a/ptrace.c
>>> +++ b/ptrace.c
>>> @@ -60,11 +60,11 @@ int suspend_seccomp(pid_t pid)
>>> * up with someone else.
>>> */
>>>
>>> -int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
>>> +int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze)
>>> {
>>> siginfo_t si;
>>> int status;
>>> - int ret, ret2, ptrace_errno, wait_errno = 0;
>>> + int ret = 0, ret2, ptrace_errno = 0, wait_errno = 0;
>>> struct proc_status_creds cr;
>>>
>>> /*
>>> @@ -72,23 +72,25 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
>>> */
>>> memzero(&cr, sizeof(struct proc_status_creds));
>>>
>>> - ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
>>> - ptrace_errno = errno;
>>> - if (ret == 0) {
>>> - /*
>>> - * If we SEIZE-d the task stop it before going
>>> - * and reading its stat from proc. Otherwise task
>>> - * may die _while_ we're doing it and we'll have
>>> - * inconsistent seize/state pair.
>>> - *
>>> - * If task dies after we seize it but before we
>>> - * do this interrupt, we'll notice it via proc.
>>> - */
>>> - ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
>>> - if (ret < 0) {
>>> - pr_perror("SEIZE %d: can't interrupt task", pid);
>>> - ptrace(PTRACE_DETACH, pid, NULL, NULL);
>>> - goto err;
>>> + if (freeze != PTRACE_FREEZE_POST) {
>>> + ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
>>> + ptrace_errno = errno;
>>> + if (ret == 0) {
>>> + /*
>>> + * If we SEIZE-d the task stop it before going
>>> + * and reading its stat from proc. Otherwise task
>>> + * may die _while_ we're doing it and we'll have
>>> + * inconsistent seize/state pair.
>>> + *
>>> + * If task dies after we seize it but before we
>>> + * do this interrupt, we'll notice it via proc.
>>> + */
>>> + ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
>>> + if (ret < 0) {
>>> + pr_perror("SEIZE %d: can't interrupt task", pid);
>>> + ptrace(PTRACE_DETACH, pid, NULL, NULL);
>>> + goto err;
>>> + }
>>> }
>>> }
>>>
>>> @@ -101,8 +103,18 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
>>>
>>> try_again:
>>> if (!ret) {
>>> - ret = wait4(pid, &status, __WALL, NULL);
>>> + int flags = 0;
>>> +
>>> + if (freeze == PTRACE_FREEZE_PRE)
>>> + flags |= WNOHANG;
>>> +
>>> + ret = wait4(pid, &status, __WALL | flags, NULL);
>>> wait_errno = errno;
>>> +
>>> + if (freeze == PTRACE_FREEZE_PRE && ret > 0) {
>>> + pr_err("The %d task is not frozen\n", pid);
>>> + goto err;
>>> + }
>>> }
>>>
>>> ret2 = parse_pid_status(pid, &cr);
>>> @@ -126,6 +138,8 @@ try_again:
>>>
>>> return TASK_DEAD;
>>> }
>>> + if (freeze == PTRACE_FREEZE_PRE)
>>> + return 0;
>>>
>>> if ((ppid != -1) && (cr.ppid != ppid)) {
>>> pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
>>> diff --git a/seize.c b/seize.c
>>> index e9be332..6ed664f 100644
>>> --- a/seize.c
>>> +++ b/seize.c
>>> @@ -18,6 +18,163 @@
>>>
>>> #define NR_ATTEMPTS 5
>>
>> There appears to be an identical definition in cr-dump.c. Perhaps they can be
>> consolidated into a single definition in a common header?
>
> I moved a part of cr-dump.c to seize.c in the first patch.
>
>>
>>> +const char frozen[] = "FROZEN";
>>> +const char freezing[] = "FREEZING";
>>> +const char thawed[] = "THAWED";
>>> +
>>> +static const char *get_freezer_state(int fd)
>>> +{
>>> + int ret;
>>> + char path[PATH_MAX];
>>> +
>>> + lseek(fd, 0, SEEK_SET);
>>> + ret = read(fd, path, sizeof(path) - 1);
>>> + if (ret <= 0) {
>>
>> I don't know if this could happen on cgroupsfs:
>> should you loop if 0 < ret < sizeof(path) - 1?
>
> Currently, it isn't required. And I suppose that this code will always
> work. But theoretically you are right.
>
>
>>
>>> + pr_perror("Unable to get a current state");
>>> + goto err;
>>> + }
>>> + if (path[ret - 1] == '\n')
>>> + path[ret - 1] = 0;
>>> + else
>>> + path[ret] = 0;
>>> +
>>> + pr_debug("freezer.state=%s\n", path);
>>> + if (strcmp(path, frozen) == 0)
>>> + return frozen;
>>> + if (strcmp(path, freezing) == 0)
>>> + return freezing;
>>> + if (strcmp(path, thawed) == 0)
>>> + return thawed;
>>> +
>>> + pr_err("Unknown freezer state: %s", path);
>>> +err:
>>> + return NULL;
>>> +}
>>> +
>>> +static bool freezer_thawed;
>>> +
>>> +static int freezer_restore_state(void)
>>> +{
>>> + int fd;
>>> + char path[PATH_MAX];
>>> +
>>> + if (!opts.freeze_cgroup || freezer_thawed)
>>> + return 0;
>>> +
>>> + snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
>>> + fd = open(path, O_RDWR);
>>> + if (fd < 0) {
>>> + pr_perror("Unable to open %s", path);
>>> + return -1;
>>> + }
>>> +
>>> + if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
>>> + pr_perror("Unable to freeze tasks");
>>> + close(fd);
>>> + return -1;
>>> + }
>>> + close(fd);
>>> + return 0;
>>> +}
>>> +
>>> +static int freeze_processes(void)
>>> +{
>>> + int i, ret, fd, exit_code = -1;
>>> + char path[PATH_MAX];
>>> + const char *state = thawed;
>>> + FILE *f;
>>> +
>>> + snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
>>> + fd = open(path, O_RDWR);
>>> + if (fd < 0) {
>>> + pr_perror("Unable to open %s", path);
>>> + return -1;
>>> + }
>>> + state = get_freezer_state(fd);
>>> + if (!state) {
>>> + close(fd);
>>> + return -1;
>>> + }
>>> + if (state == thawed)
>>> + freezer_thawed = true;
>>> +
>>> + lseek(fd, 0, SEEK_SET);
>>> + if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
>>> + pr_perror("Unable to freeze tasks");
>>> + close(fd);
>>> + return -1;
>>> + }
>>> +
>>> + /*
>>> + * There is not way to wait a specified state, so we need to poll the
>>> + * freezer.state
>>> + */
>>> + for (i = 0; i < NR_ATTEMPTS; i++) {
>>> + struct timespec req = {};
>>> +
>>> + snprintf(path, sizeof(path), "%s/tasks", opts.freeze_cgroup);
>>> + f = fopen(path, "r");
>>> + if (f == NULL) {
>>> + pr_perror("Unable to open %s", path);
>>> + goto err;
>>> + }
>>> + while (fgets(path, sizeof(path), f)) {
>>> + pid_t pid;
>>> +
>>> + pid = atoi(path);
>>> +
>>> + ret = wait4(pid, NULL, __WALL | WNOHANG, NULL);
>>> + if (ret == 0) /* skip already seized tasks */
>>> + continue;
>>> + if (seize_task(pid, 0, NULL, PTRACE_FREEZE_PRE)) {
>>> + fclose(f);
>>> + goto err;
>>> + }
>>> + }
>>> + fclose(f);
>>> +
>>> + if (state == frozen)
>>> + break;
>>> +
>>> + state = get_freezer_state(fd);
>>> + if (!state)
>>> + goto err;
>>> +
>>> + if (state == frozen) {
>>> + /*
>>> + * Enumerate all tasks one more time to collect all new
>>> + * tasks, which can be born while the cgroup is being frozen.
>>> + */
>>> +
>>> + continue;
>>> + }
>>> +
>>> + req.tv_nsec = 10000000 * i;
>>> + nanosleep(&req, NULL);
>>> + }
>>> +
>>> + if (i == NR_ATTEMPTS) {
>>> + pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
>>> + goto err;
>>> + }
>>> +
>>> + exit_code = 0;
>>> +err:
>>> + if (exit_code == 0 || freezer_thawed) {
>>> + lseek(fd, 0, SEEK_SET);
>>> + if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) {
>>> + pr_perror("Unable to thaw tasks");
>>> + exit_code = -1;
>>> + }
>>> + }
>>
>> I guess you don't need to poll freezer.state here, because there aren't
>> further events that are dependent on the transition.
>
> Yes, you are right. I will add a comment here.
>
>>
>>> + if (close(fd)) {
>>> + pr_perror("Unable to thaw tasks");
>>> + return -1;
>>> + }
>>> +
>>> + return exit_code;
>>> +}
>>> +
>>> static inline bool child_collected(struct pstree_item *i, pid_t pid)
>>> {
>>> struct pstree_item *c;
>>> @@ -33,7 +190,9 @@ static int collect_task(struct pstree_item *item);
>>> static int collect_children(struct pstree_item *item)
>>> {
>>> pid_t *ch;
>>> - int ret, i, nr_children, nr_inprogress;
>>> + int ret, i, nr_children, nr_inprogress, freeze;
>>> +
>>> + freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
>>>
>>> ret = parse_children(item->pid.real, &ch, &nr_children);
>>> if (ret < 0)
>>> @@ -58,7 +217,7 @@ static int collect_children(struct pstree_item *item)
>>> goto free;
>>> }
>>>
>>> - ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds);
>>> + ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds, freeze);
>>> if (ret < 0) {
>>> /*
>>> * Here is a race window between parse_children() and seize(),
>>> @@ -143,6 +302,9 @@ void pstree_switch_state(struct pstree_item *root_item, int st)
>>> {
>>> struct pstree_item *item = root_item;
>>>
>>> + if (st != TASK_DEAD)
>>> + freezer_restore_state();
>>> +
>>> pr_info("Unfreezing tasks into %d\n", st);
>>> for_each_pstree_item(item)
>>> unseize_task_and_threads(item, st);
>>> @@ -174,7 +336,9 @@ static inline bool thread_collected(struct pstree_item *i, pid_t tid)
>>> static int collect_threads(struct pstree_item *item)
>>> {
>>> struct pid *threads = NULL;
>>> - int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;
>>> + int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0, freeze;
>>> +
>>> + freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
>>>
>>> ret = parse_threads(item->pid.real, &threads, &nr_threads);
>>> if (ret < 0)
>>> @@ -207,7 +371,7 @@ static int collect_threads(struct pstree_item *item)
>>> pr_info("\tSeizing %d's %d thread\n",
>>> item->pid.real, pid);
>>>
>>> - ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds);
>>> + ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds, freeze);
>>> if (ret < 0) {
>>> /*
>>> * Here is a race window between parse_threads() and seize(),
>>> @@ -251,6 +415,9 @@ static int collect_loop(struct pstree_item *item,
>>> {
>>> int attempts = NR_ATTEMPTS, nr_inprogress = 1;
>>>
>>> + if (opts.freeze_cgroup)
>>> + attempts = 2; /* double check that we skip nothing */
>>
>> attempts was 5. You decrease it to 2. How does that provide additional checking?
>
> On the first attempt, we collect all tasks. On the second attempt, we
> check that we don't find new tasks (nr_inprogress == 0).
Sounds good. If you'd like to add:
Reviewed-by: Christopher Covington <cov at codeaurora.org>
--
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project
More information about the CRIU
mailing list