[CRIU] [PATCH 1/2] dump: use freezer cgroup to seize processes

Christopher Covington cov at codeaurora.org
Thu Jul 23 05:38:59 PDT 2015


Hi Andrey,

On 07/22/2015 12:07 PM, Andrey Vagin wrote:
> Without using a freezer cgroup, we need to do a few iterations to catch
> all tasks, because new tasks can be born. If new tasks appear faster
> than criu collects them, criu fails. The freezer cgroup allows us to
> solve this problem.
> 
> We freeze the freezer group, then attach to tasks with ptrace and thaw
> the freezer cgroup.
> 
> We suppose that all tasks which are going to be dumped are in a specified
> freezer group.

Neat. I was recently experimenting with the cgroup freezer, and while I was
not working with the latest and greatest version of CRIU, it appeared that
CRIU hung when trying to dump an already frozen process.

Right now I'm stopping processes (SIGSTOP) at certain instruction counts, and
then dumping them with CRIU. But after reading about why the cgroup freezer was
created, it seemed like using it would be better as the process trees I work
with get more complex, potentially including bash with its funny SIGCONT
handling and multiple processes that need to be stopped/frozen all at the
exact same time.

How much/what kind of work do you think is required to support dumping
already-frozen cgroups?

I've made a few little code review comments below.

Thanks,
Chris

> diff --git a/cr-dump.c b/cr-dump.c
> index af1b281..1c05fdf 100644
> --- a/cr-dump.c
> +++ b/cr-dump.c
> @@ -788,7 +788,9 @@ static int collect_task(struct pstree_item *item);
>  static int collect_children(struct pstree_item *item)
>  {
>  	pid_t *ch;
> -	int ret, i, nr_children, nr_inprogress;
> +	int ret, i, nr_children, nr_inprogress, freeze;
> +
> +	freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
>  
>  	ret = parse_children(item->pid.real, &ch, &nr_children);
>  	if (ret < 0)
> @@ -813,7 +815,7 @@ static int collect_children(struct pstree_item *item)
>  			goto free;
>  		}
>  
> -		ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds);
> +		ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds, freeze);
>  		if (ret < 0) {
>  			/*
>  			 * Here is a race window between parse_children() and seize(),
> @@ -929,7 +931,9 @@ static inline bool thread_collected(struct pstree_item *i, pid_t tid)
>  static int collect_threads(struct pstree_item *item)
>  {
>  	struct pid *threads = NULL;
> -	int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;
> +	int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0, freeze;
> +
> +	freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
>  
>  	ret = parse_threads(item->pid.real, &threads, &nr_threads);
>  	if (ret < 0)
> @@ -962,7 +966,7 @@ static int collect_threads(struct pstree_item *item)
>  		pr_info("\tSeizing %d's %d thread\n",
>  				item->pid.real, pid);
>  
> -		ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds);
> +		ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds, freeze);
>  		if (ret < 0) {
>  			/*
>  			 * Here is a race window between parse_threads() and seize(),
> @@ -1006,6 +1010,9 @@ static int collect_loop(struct pstree_item *item,
>  {
>  	int attempts = NR_ATTEMPTS, nr_inprogress = 1;
>  
> +	if (opts.freeze_cgroup)
> +		attempts = 2; /* double check that we skip nothing */
> +
>  	/*
>  	 * While we scan the proc and seize the children/threads
>  	 * new ones can appear (with clone(CLONE_PARENT) or with
> @@ -1101,9 +1108,110 @@ int collect_pstree_ids(void)
>  	return 0;
>  }
>  
> +#define FREEZER_ATTEMPTS 10
> +static int freeze_processes()
> +{
> +	const char frozen[] = "FROZEN";
> +	const char thawed[] = "THAWED";
> +	int i, ret, fd, exit_code = -1;
> +	char path[PATH_MAX];
> +	bool last = false;
> +	FILE *f;
> +
> +	snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
> +	fd = open(path, O_RDWR);
> +	if (fd < 0) {
> +		pr_perror("Unable to open %s", path);
> +		return -1;
> +	}
> +	if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
> +		pr_perror("Unable to freeze tasks");
> +		close(fd);
> +		return -1;
> +	}
> +
> +	/*
> +	 * There is not way to wait a specified state, so we need to poll the
> +	 * freezer.state
> +	 */
> +	for (i = 0; i < FREEZER_ATTEMPTS; i++) {
> +		struct timespec req = {};
> +
> +		snprintf(path, sizeof(path), "%s/tasks", opts.freeze_cgroup);
> +		f = fopen(path, "r");
> +		while (fgets(path, sizeof(path), f)) {
> +			pid_t pid;
> +
> +			pid = atoi(path);
> +
> +			ret = wait4(pid, NULL, __WALL | WNOHANG, NULL);
> +			if (ret == 0) /* skip already seized tasks */
> +				continue;
> +			if (seize_task(pid, 0, NULL, PTRACE_FREEZE_PRE))
> +				goto err;
> +		}
> +		fclose(f);
> +
> +		if (last)
> +			break;
> +
> +		lseek(fd, 0, SEEK_SET);
> +		ret = read(fd, path, sizeof(path) - 1);
> +		if (ret <= 0) {
> +			pr_perror("Unable to get a current state");
> +			goto err;
> +		}
> +		if (path[ret - 1] == '\n')
> +			path[ret - 1] = 0;
> +		else
> +			path[ret] = 0;
> +
> +		pr_debug("freezer.state=%s\n", path);
> +		if (strcmp(path, frozen) == 0) {
> +			/*
> +			 * Enumirate all tasks one more time to collect all new

Nit: Enumerate

> +			 * tasks, which can be born while the cgroup are been freezing.

Nit: is being frozen

> +			 */
> +
> +			last = true;
> +			continue;
> +		}
> +
> +		pr_debug("freezer.state=%s instead of %s\n", path, frozen);
> +
> +		req.tv_nsec = 10000000 * i;
> +		nanosleep(&req, NULL);
> +	}
> +
> +	if (i == FREEZER_ATTEMPTS) {
> +		pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
> +		goto err;
> +	}
> +
> +	exit_code = 0;
> +err:
> +	lseek(fd, 0, SEEK_SET);
> +	if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) {
> +		pr_perror("Unable to thaw tasks");
> +		exit_code = -1;
> +	}
> +	if (close(fd)) {
> +		pr_perror("Unable to thaw tasks");
> +		return -1;
> +	}
> +
> +	return exit_code;
> +}
> +
>  static int collect_pstree(pid_t pid)
>  {
> -	int ret;
> +	int ret, freeze = PTRACE_FREEZE;
> +
> +	if (opts.freeze_cgroup) {
> +		if (freeze_processes())
> +			return -1;
> +		freeze = PTRACE_FREEZE_POST;
> +	}
>  
>  	timing_start(TIME_FREEZING);
>  
> @@ -1112,7 +1220,7 @@ static int collect_pstree(pid_t pid)
>  		return -1;
>  
>  	root_item->pid.real = pid;
> -	ret = seize_task(pid, -1, &dmpi(root_item)->pi_creds);
> +	ret = seize_task(pid, -1, &dmpi(root_item)->pi_creds, freeze);
>  	if (ret < 0)
>  		goto err;
>  	pr_info("Seized task %d, state %d\n", pid, ret);
> diff --git a/cr-exec.c b/cr-exec.c
> index f3d55f6..b149c38 100644
> --- a/cr-exec.c
> +++ b/cr-exec.c
> @@ -130,7 +130,7 @@ int cr_exec(int pid, char **opt)
>  		goto out;
>  	}
>  
> -	prev_state = ret = seize_task(pid, -1, &creds);
> +	prev_state = ret = seize_task(pid, -1, &creds, PTRACE_FREEZE);
>  	if (ret < 0) {
>  		pr_err("Can't seize task %d\n", pid);
>  		goto out;
> diff --git a/crtools.c b/crtools.c
> index b085d33..f274348 100644
> --- a/crtools.c
> +++ b/crtools.c
> @@ -234,6 +234,7 @@ int main(int argc, char *argv[], char *envp[])
>  		{ "enable-fs",			required_argument,	0, 1065 },
>  		{ "enable-external-sharing", 	no_argument, 		0, 1066 },
>  		{ "enable-external-masters", 	no_argument, 		0, 1067 },
> +		{ "freeze-cgroup",		required_argument,	0, 1068 },
>  		{ },
>  	};
>  
> @@ -462,6 +463,9 @@ int main(int argc, char *argv[], char *envp[])
>  		case 1067:
>  			opts.enable_external_masters = true;
>  			break;
> +		case 1068:
> +			opts.freeze_cgroup = optarg;
> +			break;
>  		case 'M':
>  			{
>  				char *aux;
> @@ -673,6 +677,8 @@ usage:
>  "                        'cpu','fpu','all','ins','none'. To disable capability, prefix it with '^'.\n"
>  "     --exec-cmd         execute the command specified after '--' on successful\n"
>  "                        restore making it the parent of the restored process\n"
> +"  --freeze-cgroup\n"
> +"                        use freeze cgroup to collect processes\n"

Nit: cgroup freezer

>  "\n"
>  "* Special resources support:\n"
>  "  -x|--" USK_EXT_PARAM "      allow external unix connections\n"
> diff --git a/include/cr_options.h b/include/cr_options.h
> index 9ab8bba..09b0794 100644
> --- a/include/cr_options.h
> +++ b/include/cr_options.h
> @@ -56,6 +56,7 @@ struct cr_options {
>  	char			*output;
>  	char			*root;
>  	char			*pidfile;
> +	char			*freeze_cgroup;
>  	struct list_head	veth_pairs;
>  	struct list_head	scripts;
>  	struct list_head	ext_mounts;
> diff --git a/include/ptrace.h b/include/ptrace.h
> index 44b66c9..568e51e 100644
> --- a/include/ptrace.h
> +++ b/include/ptrace.h
> @@ -67,7 +67,10 @@ struct ptrace_peeksiginfo_args {
>  
>  #define SI_EVENT(_si_code)	(((_si_code) & 0xFFFF) >> 8)
>  
> -extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds);
> +#define PTRACE_FREEZE		0
> +#define PTRACE_FREEZE_PRE	1
> +#define PTRACE_FREEZE_POST	2

I didn't understand what these mean on my first read-through. Do you think you
could briefly document them, either in the source or in the commit message?

> +extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze);
>  extern int suspend_seccomp(pid_t pid);
>  extern int unseize_task(pid_t pid, int orig_state, int state);
>  extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
> diff --git a/proc_parse.c b/proc_parse.c
> index ed78d70..251cf3b 100644
> --- a/proc_parse.c
> +++ b/proc_parse.c
> @@ -1162,6 +1162,7 @@ struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump)
>  
>  		new->nsid = nsid;
>  
> +		pr_debug("%s\n", str);
>  		ret = parse_mountinfo_ent(str, new, &fsname);
>  		if (ret < 0) {
>  			pr_err("Bad format in %d mountinfo: '%s'\n", pid, str);
> diff --git a/ptrace.c b/ptrace.c
> index 905eaec..dc96cfa 100644
> --- a/ptrace.c
> +++ b/ptrace.c
> @@ -60,11 +60,11 @@ int suspend_seccomp(pid_t pid)
>   * up with someone else.
>   */
>  
> -int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
> +int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze)
>  {
>  	siginfo_t si;
>  	int status;
> -	int ret, ret2, ptrace_errno, wait_errno = 0;
> +	int ret = 0, ret2, ptrace_errno = 0, wait_errno = 0;
>  	struct proc_status_creds cr;
>  
>  	/*
> @@ -72,23 +72,25 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
>  	 */
>  	memzero(&cr, sizeof(struct proc_status_creds));
>  
> -	ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
> -	ptrace_errno = errno;
> -	if (ret == 0) {
> -		/*
> -		 * If we SEIZE-d the task stop it before going
> -		 * and reading its stat from proc. Otherwise task
> -		 * may die _while_ we're doing it and we'll have
> -		 * inconsistent seize/state pair.
> -		 *
> -		 * If task dies after we seize it but before we
> -		 * do this interrupt, we'll notice it via proc.
> -		 */
> -		ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
> -		if (ret < 0) {
> -			pr_perror("SEIZE %d: can't interrupt task", pid);
> -			ptrace(PTRACE_DETACH, pid, NULL, NULL);
> -			goto err;
> +	if (freeze != 2) {

PTRACE_FREEZE_POST ?

> +		ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
> +		ptrace_errno = errno;
> +		if (ret == 0) {
> +			/*
> +			 * If we SEIZE-d the task stop it before going
> +			 * and reading its stat from proc. Otherwise task
> +			 * may die _while_ we're doing it and we'll have
> +			 * inconsistent seize/state pair.
> +			 *
> +			 * If task dies after we seize it but before we
> +			 * do this interrupt, we'll notice it via proc.
> +			 */
> +			ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
> +			if (ret < 0) {
> +				pr_perror("SEIZE %d: can't interrupt task", pid);
> +				ptrace(PTRACE_DETACH, pid, NULL, NULL);
> +				goto err;
> +			}
>  		}
>  	}
>  
> @@ -101,7 +103,16 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
>  
>  try_again:
>  	if (!ret) {
> -		ret = wait4(pid, &status, __WALL, NULL);
> +		int flags = 0;
> +
> +		if (freeze == 1)

PTRACE_FREEZE_PRE ?

> +			flags |= WNOHANG;
> +
> +		ret = wait4(pid, &status, __WALL | flags, NULL);
> +		if (freeze == 1 && ret > 0) {

PTRACE_FREEZE_PRE ?

> +			pr_err("The %d task is not frozen\n", pid);
> +			goto err;
> +		}
>  		wait_errno = errno;
>  	}
>  
> @@ -126,6 +137,8 @@ try_again:
>  
>  		return TASK_DEAD;
>  	}
> +	if (freeze == 1)

PTRACE_FREEZE_PRE ?

> +		return 0;
>  
>  	if ((ppid != -1) && (cr.ppid != ppid)) {
>  		pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
> 


-- 
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project


More information about the CRIU mailing list