[CRIU] [PATCH] dump: when freezing fails, log unfrozen task stacks
Tycho Andersen
tycho.andersen at canonical.com
Mon Jul 11 08:21:12 PDT 2016
When freezing, we can sometimes fail due to a timeout. When that happens, the
cause may be a kernel bug in which tasks are deadlocked. It would be useful to
log the kernel stacks so we can see where such tasks are hung.
Note that this is racy: a task might still freeze while we are logging the
various stacks, so we might miss things. But in the case of a deadlock, it
should catch what's wrong.
Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
criu/seize.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 79 insertions(+)
diff --git a/criu/seize.c b/criu/seize.c
index fb78a0b..0d4a887 100644
--- a/criu/seize.c
+++ b/criu/seize.c
@@ -243,6 +243,81 @@ static int freezer_detach(void)
return 0;
}
+/*
+ * Debug-log the kernel stack of every task in freezer cgroup @root and, by
+ * recursion, in each of its sub-cgroups, noting whether "__refrigerator"
+ * appears in it. Returns 0 on success, -1 on the first failure.
+ */
+static int log_unfrozen_stacks(char *root)
+{
+	DIR *dir;
+	struct dirent *de;
+	char path[PATH_MAX];
+	FILE *f;
+
+	snprintf(path, sizeof(path), "%s/tasks", root);
+	f = fopen(path, "r");
+	if (f == NULL) {
+		pr_perror("Unable to open %s", path);
+		return -1;
+	}
+	while (fgets(path, sizeof(path), f)) {
+		pid_t pid;
+		int ret, stack;
+		char stackbuf[2048];
+
+		pid = atoi(path);
+		stack = open_proc(pid, "stack");
+		if (stack < 0) {
+			pr_perror("couldn't log %d's stack", pid);
+			fclose(f);
+			return -1;
+		}
+
+		/* Leave room for the terminating NUL written below. */
+		ret = read(stack, stackbuf, sizeof(stackbuf) - 1);
+		close(stack);
+		if (ret < 0) {
+			pr_perror("couldn't read %d's stack", pid);
+			fclose(f);
+			return -1;
+		}
+		stackbuf[ret] = '\0';
+
+		if (!strstr(stackbuf, "__refrigerator"))
+			pr_debug("Couldn't freeze %d with stack:\n%s", pid, stackbuf);
+		else
+			pr_debug("Froze %d\n", pid);
+	}
+	fclose(f);
+
+	dir = opendir(root);
+	if (!dir) {
+		pr_perror("Unable to open %s", root);
+		return -1;
+	}
+	while ((de = readdir(dir))) {
+		struct stat st;
+
+		if (dir_dots(de))
+			continue;
+		snprintf(path, sizeof(path), "%s/%s", root, de->d_name);
+		if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0) {
+			pr_perror("stat of %s failed", path);
+			closedir(dir);
+			return -1;
+		}
+		if (!S_ISDIR(st.st_mode))
+			continue;
+		if (log_unfrozen_stacks(path) < 0) {
+			closedir(dir);
+			return -1;
+		}
+	}
+	closedir(dir);
+	return 0;
+}
+
static int freeze_processes(void)
{
int i, fd, exit_code = -1;
@@ -310,6 +385,10 @@ static int freeze_processes(void)
if (i > NR_ATTEMPTS) {
pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
+
+ if (!pr_quelled(LOG_DEBUG))
+ log_unfrozen_stacks(opts.freeze_cgroup);
+
goto err;
}
--
2.7.4
More information about the CRIU
mailing list