[CRIU] [PATCH] dump: when freezing fails, log unfrozen task stacks

Tycho Andersen tycho.andersen at canonical.com
Mon Jul 11 08:25:21 PDT 2016


On Mon, Jul 11, 2016 at 03:21:12PM +0000, Tycho Andersen wrote:
> When freezing, sometimes we can fail due to a timeout. If this is the case,
> sometimes it can be a kernel bug where tasks are deadlocked. It would be
> useful to know the kernel stacks to see where tasks are hung if they are.
> 
> Note that this is racy: a task might still freeze while we are logging the
> various stacks, so we might miss things. But in the case of a deadlock, it
> should catch what's wrong.

Hmm, actually, this version only logs the stack traces of unfrozen tasks,
which was all we needed, but it might be useful to have all of them. I'll
resend a version that logs all the traces.

> Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
> ---
>  criu/seize.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 79 insertions(+)
> 
> diff --git a/criu/seize.c b/criu/seize.c
> index fb78a0b..0d4a887 100644
> --- a/criu/seize.c
> +++ b/criu/seize.c
> @@ -243,6 +243,81 @@ static int freezer_detach(void)
>  	return 0;
>  }
>  
> +static int log_unfrozen_stacks(char *root)
> +{
> +	DIR *dir;
> +	struct dirent *de;
> +	char path[PATH_MAX];
> +	FILE *f;
> +
> +	snprintf(path, sizeof(path), "%s/tasks", root);
> +	f = fopen(path, "r");
> +	if (f == NULL) {
> +		pr_perror("Unable to open %s", path);
> +		return -1;
> +	}
> +	while (fgets(path, sizeof(path), f)) {
> +		pid_t pid;
> +		int ret, stack;
> +		char stackbuf[2048];
> +
> +		pid = atoi(path);
> +
> +		stack = open_proc(pid, "stack");
> +		if (stack < 0) {
> +			pr_perror("couldn't log %d's stack", pid);
> +			return -1;
> +		}
> +
> +		ret = read(stack, stackbuf, sizeof(stackbuf));
> +		close(stack);
> +		if (ret < 0) {
> +			pr_perror("couldn't read %d's stack", pid);
> +			return -1;
> +		}
> +		stackbuf[ret] = '\0';
> +
> +		if (!strstr(stackbuf, "__refrigerator"))
> +			pr_debug("Couldn't freeze %d with stack:\n%s", pid, stackbuf);
> +		else
> +			pr_debug("Froze %d\n", pid);
> +
> +	}
> +	fclose(f);
> +
> +	dir = opendir(root);
> +	if (!dir) {
> +		pr_perror("Unable to open %s", root);
> +		return -1;
> +	}
> +
> +	while ((de = readdir(dir))) {
> +		struct stat st;
> +
> +		if (dir_dots(de))
> +			continue;
> +
> +		sprintf(path, "%s/%s", root, de->d_name);
> +
> +		if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0) {
> +			pr_perror("stat of %s failed", path);
> +			closedir(dir);
> +			return -1;
> +		}
> +
> +		if (!S_ISDIR(st.st_mode))
> +			continue;
> +
> +		if (log_unfrozen_stacks(path) < 0) {
> +			closedir(dir);
> +			return -1;
> +		}
> +	}
> +	closedir(dir);
> +
> +	return 0;
> +}
> +
>  static int freeze_processes(void)
>  {
>  	int i, fd, exit_code = -1;
> @@ -310,6 +385,10 @@ static int freeze_processes(void)
>  
>  	if (i > NR_ATTEMPTS) {
>  		pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
> +
> +		if (!pr_quelled(LOG_DEBUG))
> +			log_unfrozen_stacks(opts.freeze_cgroup);
> +
>  		goto err;
>  	}
>  
> -- 
> 2.7.4
> 


More information about the CRIU mailing list