[CRIU] [PATCH 3/3] dump: use freezer cgroup to seize processes (v2)
Andrey Vagin
avagin at openvz.org
Mon Aug 3 19:18:24 PDT 2015
Without using a freezer cgroup, we need to do a few iterations to catch
all tasks, because a new tasks can be born. If new tasks appear faster
than criu collects them, criu fails. The freezer cgroup allows to
solve this problem.
We freeze the freezer group, then attaches to tasks with ptrace and thaw
the freezer cgroup. We suppose that all tasks which are going to be
dumped in a specified freezer group.
v2: fix comments from Christopher
Cc: Christopher Covington <cov at codeaurora.org>
Signed-off-by: Andrey Vagin <avagin at openvz.org>
---
cr-exec.c | 2 +-
crtools.c | 6 ++
include/cr_options.h | 1 +
include/ptrace.h | 5 +-
proc_parse.c | 1 +
ptrace.c | 54 +++++++++------
seize.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++++--
7 files changed, 226 insertions(+), 28 deletions(-)
diff --git a/cr-exec.c b/cr-exec.c
index f3d55f6..b149c38 100644
--- a/cr-exec.c
+++ b/cr-exec.c
@@ -130,7 +130,7 @@ int cr_exec(int pid, char **opt)
goto out;
}
- prev_state = ret = seize_task(pid, -1, &creds);
+ prev_state = ret = seize_task(pid, -1, &creds, PTRACE_FREEZE);
if (ret < 0) {
pr_err("Can't seize task %d\n", pid);
goto out;
diff --git a/crtools.c b/crtools.c
index 6af6080..9096420 100644
--- a/crtools.c
+++ b/crtools.c
@@ -235,6 +235,7 @@ int main(int argc, char *argv[], char *envp[])
{ "enable-fs", required_argument, 0, 1065 },
{ "enable-external-sharing", no_argument, 0, 1066 },
{ "enable-external-masters", no_argument, 0, 1067 },
+ { "freeze-cgroup", required_argument, 0, 1068 },
{ },
};
@@ -465,6 +466,9 @@ int main(int argc, char *argv[], char *envp[])
case 1067:
opts.enable_external_masters = true;
break;
+ case 1068:
+ opts.freeze_cgroup = optarg;
+ break;
case 'M':
{
char *aux;
@@ -676,6 +680,8 @@ usage:
" 'cpu','fpu','all','ins','none'. To disable capability, prefix it with '^'.\n"
" --exec-cmd execute the command specified after '--' on successful\n"
" restore making it the parent of the restored process\n"
+" --freeze-cgroup\n"
+" use cgroup freezer to collect processes\n"
"\n"
"* Special resources support:\n"
" -x|--" USK_EXT_PARAM "inode,.." " allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
diff --git a/include/cr_options.h b/include/cr_options.h
index 19c2f77..f981806 100644
--- a/include/cr_options.h
+++ b/include/cr_options.h
@@ -57,6 +57,7 @@ struct cr_options {
char *output;
char *root;
char *pidfile;
+ char *freeze_cgroup;
struct list_head veth_pairs;
struct list_head scripts;
struct list_head ext_mounts;
diff --git a/include/ptrace.h b/include/ptrace.h
index 44b66c9..ce91a45 100644
--- a/include/ptrace.h
+++ b/include/ptrace.h
@@ -67,7 +67,10 @@ struct ptrace_peeksiginfo_args {
#define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8)
-extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds);
+#define PTRACE_FREEZE_PRE 0 /* Attach to the process (PTRACE_SEIZE and PTRACE_INTERRUPT) */
+#define PTRACE_FREEZE_POST 1 /* Wait when the process will be stopped */
+#define PTRACE_FREEZE 2 /* Do both stages together */
+extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze);
extern int suspend_seccomp(pid_t pid);
extern int unseize_task(pid_t pid, int orig_state, int state);
extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
diff --git a/proc_parse.c b/proc_parse.c
index 95d6505..ea98a97 100644
--- a/proc_parse.c
+++ b/proc_parse.c
@@ -1135,6 +1135,7 @@ struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump)
new->nsid = nsid;
+ pr_debug("%s\n", str);
ret = parse_mountinfo_ent(str, new, &fsname);
if (ret < 0) {
pr_err("Bad format in %d mountinfo: '%s'\n", pid, str);
diff --git a/ptrace.c b/ptrace.c
index 905eaec..e614be4 100644
--- a/ptrace.c
+++ b/ptrace.c
@@ -60,11 +60,11 @@ int suspend_seccomp(pid_t pid)
* up with someone else.
*/
-int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
+int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze)
{
siginfo_t si;
int status;
- int ret, ret2, ptrace_errno, wait_errno = 0;
+ int ret = 0, ret2, ptrace_errno = 0, wait_errno = 0;
struct proc_status_creds cr;
/*
@@ -72,23 +72,25 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
*/
memzero(&cr, sizeof(struct proc_status_creds));
- ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
- ptrace_errno = errno;
- if (ret == 0) {
- /*
- * If we SEIZE-d the task stop it before going
- * and reading its stat from proc. Otherwise task
- * may die _while_ we're doing it and we'll have
- * inconsistent seize/state pair.
- *
- * If task dies after we seize it but before we
- * do this interrupt, we'll notice it via proc.
- */
- ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
- if (ret < 0) {
- pr_perror("SEIZE %d: can't interrupt task", pid);
- ptrace(PTRACE_DETACH, pid, NULL, NULL);
- goto err;
+ if (freeze != PTRACE_FREEZE_POST) {
+ ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
+ ptrace_errno = errno;
+ if (ret == 0) {
+ /*
+ * If we SEIZE-d the task stop it before going
+ * and reading its stat from proc. Otherwise task
+ * may die _while_ we're doing it and we'll have
+ * inconsistent seize/state pair.
+ *
+ * If task dies after we seize it but before we
+ * do this interrupt, we'll notice it via proc.
+ */
+ ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
+ if (ret < 0) {
+ pr_perror("SEIZE %d: can't interrupt task", pid);
+ ptrace(PTRACE_DETACH, pid, NULL, NULL);
+ goto err;
+ }
}
}
@@ -101,8 +103,18 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
try_again:
if (!ret) {
- ret = wait4(pid, &status, __WALL, NULL);
+ int flags = 0;
+
+ if (freeze == PTRACE_FREEZE_PRE)
+ flags |= WNOHANG;
+
+ ret = wait4(pid, &status, __WALL | flags, NULL);
wait_errno = errno;
+
+ if (freeze == PTRACE_FREEZE_PRE && ret > 0) {
+ pr_err("The %d task is not frozen\n", pid);
+ goto err;
+ }
}
ret2 = parse_pid_status(pid, &cr);
@@ -126,6 +138,8 @@ try_again:
return TASK_DEAD;
}
+ if (freeze == PTRACE_FREEZE_PRE)
+ return 0;
if ((ppid != -1) && (cr.ppid != ppid)) {
pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
diff --git a/seize.c b/seize.c
index e9be332..6ed664f 100644
--- a/seize.c
+++ b/seize.c
@@ -18,6 +18,163 @@
#define NR_ATTEMPTS 5
+const char frozen[] = "FROZEN";
+const char freezing[] = "FREEZING";
+const char thawed[] = "THAWED";
+
+static const char *get_freezer_state(int fd)
+{
+ int ret;
+ char path[PATH_MAX];
+
+ lseek(fd, 0, SEEK_SET);
+ ret = read(fd, path, sizeof(path) - 1);
+ if (ret <= 0) {
+ pr_perror("Unable to get a current state");
+ goto err;
+ }
+ if (path[ret - 1] == '\n')
+ path[ret - 1] = 0;
+ else
+ path[ret] = 0;
+
+ pr_debug("freezer.state=%s\n", path);
+ if (strcmp(path, frozen) == 0)
+ return frozen;
+ if (strcmp(path, freezing) == 0)
+ return freezing;
+ if (strcmp(path, thawed) == 0)
+ return thawed;
+
+ pr_err("Unknown freezer state: %s", path);
+err:
+ return NULL;
+}
+
+static bool freezer_thawed;
+
+static int freezer_restore_state(void)
+{
+ int fd;
+ char path[PATH_MAX];
+
+ if (!opts.freeze_cgroup || freezer_thawed)
+ return 0;
+
+ snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
+ fd = open(path, O_RDWR);
+ if (fd < 0) {
+ pr_perror("Unable to open %s", path);
+ return -1;
+ }
+
+ if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
+ pr_perror("Unable to freeze tasks");
+ close(fd);
+ return -1;
+ }
+ close(fd);
+ return 0;
+}
+
+static int freeze_processes(void)
+{
+ int i, ret, fd, exit_code = -1;
+ char path[PATH_MAX];
+ const char *state = thawed;
+ FILE *f;
+
+ snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
+ fd = open(path, O_RDWR);
+ if (fd < 0) {
+ pr_perror("Unable to open %s", path);
+ return -1;
+ }
+ state = get_freezer_state(fd);
+ if (!state) {
+ close(fd);
+ return -1;
+ }
+ if (state == thawed)
+ freezer_thawed = true;
+
+ lseek(fd, 0, SEEK_SET);
+ if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
+ pr_perror("Unable to freeze tasks");
+ close(fd);
+ return -1;
+ }
+
+ /*
+ * There is not way to wait a specified state, so we need to poll the
+ * freezer.state
+ */
+ for (i = 0; i < NR_ATTEMPTS; i++) {
+ struct timespec req = {};
+
+ snprintf(path, sizeof(path), "%s/tasks", opts.freeze_cgroup);
+ f = fopen(path, "r");
+ if (f == NULL) {
+ pr_perror("Unable to open %s", path);
+ goto err;
+ }
+ while (fgets(path, sizeof(path), f)) {
+ pid_t pid;
+
+ pid = atoi(path);
+
+ ret = wait4(pid, NULL, __WALL | WNOHANG, NULL);
+ if (ret == 0) /* skip already seized tasks */
+ continue;
+ if (seize_task(pid, 0, NULL, PTRACE_FREEZE_PRE)) {
+ fclose(f);
+ goto err;
+ }
+ }
+ fclose(f);
+
+ if (state == frozen)
+ break;
+
+ state = get_freezer_state(fd);
+ if (!state)
+ goto err;
+
+ if (state == frozen) {
+ /*
+ * Enumerate all tasks one more time to collect all new
+ * tasks, which can be born while the cgroup is being frozen.
+ */
+
+ continue;
+ }
+
+ req.tv_nsec = 10000000 * i;
+ nanosleep(&req, NULL);
+ }
+
+ if (i == NR_ATTEMPTS) {
+ pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
+ goto err;
+ }
+
+ exit_code = 0;
+err:
+ if (exit_code == 0 || freezer_thawed) {
+ lseek(fd, 0, SEEK_SET);
+ if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) {
+ pr_perror("Unable to thaw tasks");
+ exit_code = -1;
+ }
+ }
+ if (close(fd)) {
+ pr_perror("Unable to thaw tasks");
+ return -1;
+ }
+
+ return exit_code;
+}
+
static inline bool child_collected(struct pstree_item *i, pid_t pid)
{
struct pstree_item *c;
@@ -33,7 +190,9 @@ static int collect_task(struct pstree_item *item);
static int collect_children(struct pstree_item *item)
{
pid_t *ch;
- int ret, i, nr_children, nr_inprogress;
+ int ret, i, nr_children, nr_inprogress, freeze;
+
+ freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
ret = parse_children(item->pid.real, &ch, &nr_children);
if (ret < 0)
@@ -58,7 +217,7 @@ static int collect_children(struct pstree_item *item)
goto free;
}
- ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds);
+ ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds, freeze);
if (ret < 0) {
/*
* Here is a race window between parse_children() and seize(),
@@ -143,6 +302,9 @@ void pstree_switch_state(struct pstree_item *root_item, int st)
{
struct pstree_item *item = root_item;
+ if (st != TASK_DEAD)
+ freezer_restore_state();
+
pr_info("Unfreezing tasks into %d\n", st);
for_each_pstree_item(item)
unseize_task_and_threads(item, st);
@@ -174,7 +336,9 @@ static inline bool thread_collected(struct pstree_item *i, pid_t tid)
static int collect_threads(struct pstree_item *item)
{
struct pid *threads = NULL;
- int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;
+ int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0, freeze;
+
+ freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
ret = parse_threads(item->pid.real, &threads, &nr_threads);
if (ret < 0)
@@ -207,7 +371,7 @@ static int collect_threads(struct pstree_item *item)
pr_info("\tSeizing %d's %d thread\n",
item->pid.real, pid);
- ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds);
+ ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds, freeze);
if (ret < 0) {
/*
* Here is a race window between parse_threads() and seize(),
@@ -251,6 +415,9 @@ static int collect_loop(struct pstree_item *item,
{
int attempts = NR_ATTEMPTS, nr_inprogress = 1;
+ if (opts.freeze_cgroup)
+ attempts = 2; /* double check that we skip nothing */
+
/*
* While we scan the proc and seize the children/threads
* new ones can appear (with clone(CLONE_PARENT) or with
@@ -307,7 +474,13 @@ err_close:
int collect_pstree(pid_t pid)
{
- int ret;
+ int ret, freeze = PTRACE_FREEZE;
+
+ if (opts.freeze_cgroup) {
+ if (freeze_processes())
+ return -1;
+ freeze = PTRACE_FREEZE_POST;
+ }
timing_start(TIME_FREEZING);
@@ -316,7 +489,7 @@ int collect_pstree(pid_t pid)
return -1;
root_item->pid.real = pid;
- ret = seize_task(pid, -1, &dmpi(root_item)->pi_creds);
+ ret = seize_task(pid, -1, &dmpi(root_item)->pi_creds, freeze);
if (ret < 0)
goto err;
pr_info("Seized task %d, state %d\n", pid, ret);
--
2.1.0
More information about the CRIU
mailing list