[CRIU] [PATCH 1/2] dump: use freezer cgroup to seize processes

Thu Jul 23 06:18:47 PDT 2015

On Thu, Jul 23, 2015 at 08:38:59AM -0400, Christopher Covington wrote:
> Hi Andrey,
> 
> On 07/22/2015 12:07 PM, Andrey Vagin wrote:
> > Without using a freezer cgroup, we need to do a few iterations to catch
> > all tasks, because a new tasks can be born. If new tasks appear faster
> > than criu collects them, criu fails. The freezer cgroup allows to
> > solve this problem.
> > 
> > We freeze the freezer group, then attache to tasks with ptrace and thaw
> > the freezer cgroup.
> > 
> > We suppose that all tasks which are going to be dumped in a specified freezer
> > group.
> 
> Neat. I was recently experimenting with the cgroup freezer, and while I was
> not working with the latest and greatest version of CRIU, it appeared that
> CRIU hung when trying to dump an already frozen process.
> 
> Right now I'm stopping processes (SIGSTOP) at certain instruction counts, and
> then CRIU dumping them. But after reading about why the cgroup freezer was
> created, it seemed like using it would be better as the process trees I work
> with get more complex, potentially including bash with its funny SIGCONT
> handling and multiple processes that need to be stopped/frozen all at the
> exact same time.
> 
> How much/what kind of work do you think is required to support dumping
> already-frozen cgroups?

This version allows to dump processes in an already-frozen group. CRIU
attaches to all processes and then unfreezes the groop, because we are
going to execute the parasite code in porcesses.

> 
> I've made a few little code review comments below.

Thank you for comments! I will fix.

> 
> Thanks,
> Chris
> 
> > diff --git a/cr-dump.c b/cr-dump.c
> > index af1b281..1c05fdf 100644
> > --- a/cr-dump.c
> > +++ b/cr-dump.c
> > @@ -788,7 +788,9 @@ static int collect_task(struct pstree_item *item);
> >  static int collect_children(struct pstree_item *item)
> >  {
> >  	pid_t *ch;
> > -	int ret, i, nr_children, nr_inprogress;
> > +	int ret, i, nr_children, nr_inprogress, freeze;
> > +
> > +	freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
> >  
> >  	ret = parse_children(item->pid.real, &ch, &nr_children);
> >  	if (ret < 0)
> > @@ -813,7 +815,7 @@ static int collect_children(struct pstree_item *item)
> >  			goto free;
> >  		}
> >  
> > -		ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds);
> > +		ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds, freeze);
> >  		if (ret < 0) {
> >  			/*
> >  			 * Here is a race window between parse_children() and seize(),
> > @@ -929,7 +931,9 @@ static inline bool thread_collected(struct pstree_item *i, pid_t tid)
> >  static int collect_threads(struct pstree_item *item)
> >  {
> >  	struct pid *threads = NULL;
> > -	int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;
> > +	int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0, freeze;
> > +
> > +	freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
> >  
> >  	ret = parse_threads(item->pid.real, &threads, &nr_threads);
> >  	if (ret < 0)
> > @@ -962,7 +966,7 @@ static int collect_threads(struct pstree_item *item)
> >  		pr_info("\tSeizing %d's %d thread\n",
> >  				item->pid.real, pid);
> >  
> > -		ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds);
> > +		ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds, freeze);
> >  		if (ret < 0) {
> >  			/*
> >  			 * Here is a race window between parse_threads() and seize(),
> > @@ -1006,6 +1010,9 @@ static int collect_loop(struct pstree_item *item,
> >  {
> >  	int attempts = NR_ATTEMPTS, nr_inprogress = 1;
> >  
> > +	if (opts.freeze_cgroup)
> > +		attempts = 2; /* double check that we skip nothing */
> > +
> >  	/*
> >  	 * While we scan the proc and seize the children/threads
> >  	 * new ones can appear (with clone(CLONE_PARENT) or with
> > @@ -1101,9 +1108,110 @@ int collect_pstree_ids(void)
> >  	return 0;
> >  }
> >  
> > +#define FREEZER_ATTEMPTS 10
> > +static int freeze_processes()
> > +{
> > +	const char frozen[] = "FROZEN";
> > +	const char thawed[] = "THAWED";
> > +	int i, ret, fd, exit_code = -1;
> > +	char path[PATH_MAX];
> > +	bool last = false;
> > +	FILE *f;
> > +
> > +	snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
> > +	fd = open(path, O_RDWR);
> > +	if (fd < 0) {
> > +		pr_perror("Unable to open %s", path);
> > +		return -1;
> > +	}
> > +	if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
> > +		pr_perror("Unable to freeze tasks");
> > +		close(fd);
> > +		return -1;
> > +	}
> > +
> > +	/*
> > +	 * There is not way to wait a specified state, so we need to poll the
> > +	 * freezer.state
> > +	 */
> > +	for (i = 0; i < FREEZER_ATTEMPTS; i++) {
> > +		struct timespec req = {};
> > +
> > +		snprintf(path, sizeof(path), "%s/tasks", opts.freeze_cgroup);
> > +		f = fopen(path, "r");
> > +		while (fgets(path, sizeof(path), f)) {
> > +			pid_t pid;
> > +
> > +			pid = atoi(path);
> > +
> > +			ret = wait4(pid, NULL, __WALL | WNOHANG, NULL);
> > +			if (ret == 0) /* skip already seized tasks */
> > +				continue;
> > +			if (seize_task(pid, 0, NULL, PTRACE_FREEZE_PRE))
> > +				goto err;
> > +		}
> > +		fclose(f);
> > +
> > +		if (last)
> > +			break;
> > +
> > +		lseek(fd, 0, SEEK_SET);
> > +		ret = read(fd, path, sizeof(path) - 1);
> > +		if (ret <= 0) {
> > +			pr_perror("Unable to get a current state");
> > +			goto err;
> > +		}
> > +		if (path[ret - 1] == '\n')
> > +			path[ret - 1] = 0;
> > +		else
> > +			path[ret] = 0;
> > +
> > +		pr_debug("freezer.state=%s\n", path);
> > +		if (strcmp(path, frozen) == 0) {
> > +			/*
> > +			 * Enumirate all tasks one more time to collect all new
> 
> Nit: Enumerate
> 
> > +			 * tasks, which can be born while the cgroup are been freezing.
> 
> Nit: is being frozen
> 
> > +			 */
> > +
> > +			last = true;
> > +			continue;
> > +		}
> > +
> > +		pr_debug("freezer.state=%s instead of %s\n", path, frozen);
> > +
> > +		req.tv_nsec = 10000000 * i;
> > +		nanosleep(&req, NULL);
> > +	}
> > +
> > +	if (i == FREEZER_ATTEMPTS) {
> > +		pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
> > +		goto err;
> > +	}
> > +
> > +	exit_code = 0;
> > +err:
> > +	lseek(fd, 0, SEEK_SET);
> > +	if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) {
> > +		pr_perror("Unable to thaw tasks");
> > +		exit_code = -1;
> > +	}
> > +	if (close(fd)) {
> > +		pr_perror("Unable to thaw tasks");
> > +		return -1;
> > +	}
> > +
> > +	return exit_code;
> > +}
> > +
> >  static int collect_pstree(pid_t pid)
> >  {
> > -	int ret;
> > +	int ret, freeze = PTRACE_FREEZE;
> > +
> > +	if (opts.freeze_cgroup) {
> > +		if (freeze_processes())
> > +			return -1;
> > +		freeze = PTRACE_FREEZE_POST;
> > +	}
> >  
> >  	timing_start(TIME_FREEZING);
> >  
> > @@ -1112,7 +1220,7 @@ static int collect_pstree(pid_t pid)
> >  		return -1;
> >  
> >  	root_item->pid.real = pid;
> > -	ret = seize_task(pid, -1, &dmpi(root_item)->pi_creds);
> > +	ret = seize_task(pid, -1, &dmpi(root_item)->pi_creds, freeze);
> >  	if (ret < 0)
> >  		goto err;
> >  	pr_info("Seized task %d, state %d\n", pid, ret);
> > diff --git a/cr-exec.c b/cr-exec.c
> > index f3d55f6..b149c38 100644
> > --- a/cr-exec.c
> > +++ b/cr-exec.c
> > @@ -130,7 +130,7 @@ int cr_exec(int pid, char **opt)
> >  		goto out;
> >  	}
> >  
> > -	prev_state = ret = seize_task(pid, -1, &creds);
> > +	prev_state = ret = seize_task(pid, -1, &creds, PTRACE_FREEZE);
> >  	if (ret < 0) {
> >  		pr_err("Can't seize task %d\n", pid);
> >  		goto out;
> > diff --git a/crtools.c b/crtools.c
> > index b085d33..f274348 100644
> > --- a/crtools.c
> > +++ b/crtools.c
> > @@ -234,6 +234,7 @@ int main(int argc, char *argv[], char *envp[])
> >  		{ "enable-fs",			required_argument,	0, 1065 },
> >  		{ "enable-external-sharing", 	no_argument, 		0, 1066 },
> >  		{ "enable-external-masters", 	no_argument, 		0, 1067 },
> > +		{ "freeze-cgroup",		required_argument,	0, 1068 },
> >  		{ },
> >  	};
> >  
> > @@ -462,6 +463,9 @@ int main(int argc, char *argv[], char *envp[])
> >  		case 1067:
> >  			opts.enable_external_masters = true;
> >  			break;
> > +		case 1068:
> > +			opts.freeze_cgroup = optarg;
> > +			break;
> >  		case 'M':
> >  			{
> >  				char *aux;
> > @@ -673,6 +677,8 @@ usage:
> >  "                        'cpu','fpu','all','ins','none'. To disable capability, prefix it with '^'.\n"
> >  "     --exec-cmd         execute the command specified after '--' on successful\n"
> >  "                        restore making it the parent of the restored process\n"
> > +"  --freeze-cgroup\n"
> > +"                        use freeze cgroup to collect processes\n"
> 
> Nit: cgroup freezer
> 
> >  "\n"
> >  "* Special resources support:\n"
> >  "  -x|--" USK_EXT_PARAM "      allow external unix connections\n"
> > diff --git a/include/cr_options.h b/include/cr_options.h
> > index 9ab8bba..09b0794 100644
> > --- a/include/cr_options.h
> > +++ b/include/cr_options.h
> > @@ -56,6 +56,7 @@ struct cr_options {
> >  	char			*output;
> >  	char			*root;
> >  	char			*pidfile;
> > +	char			*freeze_cgroup;
> >  	struct list_head	veth_pairs;
> >  	struct list_head	scripts;
> >  	struct list_head	ext_mounts;
> > diff --git a/include/ptrace.h b/include/ptrace.h
> > index 44b66c9..568e51e 100644
> > --- a/include/ptrace.h
> > +++ b/include/ptrace.h
> > @@ -67,7 +67,10 @@ struct ptrace_peeksiginfo_args {
> >  
> >  #define SI_EVENT(_si_code)	(((_si_code) & 0xFFFF) >> 8)
> >  
> > -extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds);
> > +#define PTRACE_FREEZE		0
> > +#define PTRACE_FREEZE_PRE	1
> > +#define PTRACE_FREEZE_POST	2
> 
> I don't understand what these mean in my first read through. Do you think you
> could briefly document them, either in the source or commit message?
> 
> > +extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze);
> >  extern int suspend_seccomp(pid_t pid);
> >  extern int unseize_task(pid_t pid, int orig_state, int state);
> >  extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
> > diff --git a/proc_parse.c b/proc_parse.c
> > index ed78d70..251cf3b 100644
> > --- a/proc_parse.c
> > +++ b/proc_parse.c
> > @@ -1162,6 +1162,7 @@ struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump)
> >  
> >  		new->nsid = nsid;
> >  
> > +		pr_debug("%s\n", str);
> >  		ret = parse_mountinfo_ent(str, new, &fsname);
> >  		if (ret < 0) {
> >  			pr_err("Bad format in %d mountinfo: '%s'\n", pid, str);
> > diff --git a/ptrace.c b/ptrace.c
> > index 905eaec..dc96cfa 100644
> > --- a/ptrace.c
> > +++ b/ptrace.c
> > @@ -60,11 +60,11 @@ int suspend_seccomp(pid_t pid)
> >   * up with someone else.
> >   */
> >  
> > -int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
> > +int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze)
> >  {
> >  	siginfo_t si;
> >  	int status;
> > -	int ret, ret2, ptrace_errno, wait_errno = 0;
> > +	int ret = 0, ret2, ptrace_errno = 0, wait_errno = 0;
> >  	struct proc_status_creds cr;
> >  
> >  	/*
> > @@ -72,23 +72,25 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
> >  	 */
> >  	memzero(&cr, sizeof(struct proc_status_creds));
> >  
> > -	ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
> > -	ptrace_errno = errno;
> > -	if (ret == 0) {
> > -		/*
> > -		 * If we SEIZE-d the task stop it before going
> > -		 * and reading its stat from proc. Otherwise task
> > -		 * may die _while_ we're doing it and we'll have
> > -		 * inconsistent seize/state pair.
> > -		 *
> > -		 * If task dies after we seize it but before we
> > -		 * do this interrupt, we'll notice it via proc.
> > -		 */
> > -		ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
> > -		if (ret < 0) {
> > -			pr_perror("SEIZE %d: can't interrupt task", pid);
> > -			ptrace(PTRACE_DETACH, pid, NULL, NULL);
> > -			goto err;
> > +	if (freeze != 2) {
> 
> PTRACE_FREEZE_POST ?
> 
> > +		ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
> > +		ptrace_errno = errno;
> > +		if (ret == 0) {
> > +			/*
> > +			 * If we SEIZE-d the task stop it before going
> > +			 * and reading its stat from proc. Otherwise task
> > +			 * may die _while_ we're doing it and we'll have
> > +			 * inconsistent seize/state pair.
> > +			 *
> > +			 * If task dies after we seize it but before we
> > +			 * do this interrupt, we'll notice it via proc.
> > +			 */
> > +			ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
> > +			if (ret < 0) {
> > +				pr_perror("SEIZE %d: can't interrupt task", pid);
> > +				ptrace(PTRACE_DETACH, pid, NULL, NULL);
> > +				goto err;
> > +			}
> >  		}
> >  	}
> >  
> > @@ -101,7 +103,16 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
> >  
> >  try_again:
> >  	if (!ret) {
> > -		ret = wait4(pid, &status, __WALL, NULL);
> > +		int flags = 0;
> > +
> > +		if (freeze == 1)
> 
> PTRACE_FREEZE_PRE ?
> 
> > +			flags |= WNOHANG;
> > +
> > +		ret = wait4(pid, &status, __WALL | flags, NULL);
> > +		if (freeze == 1 && ret > 0) {
> 
> PTRACE_FREEZE_PRE ?
> 
> > +			pr_err("The %d task is not frozen\n", pid);
> > +			goto err;
> > +		}
> >  		wait_errno = errno;
> >  	}
> >  
> > @@ -126,6 +137,8 @@ try_again:
> >  
> >  		return TASK_DEAD;
> >  	}
> > +	if (freeze == 1)
> 
> PTRACE_FREEZE_PRE ?
> 
> > +		return 0;
> >  
> >  	if ((ppid != -1) && (cr.ppid != ppid)) {
> >  		pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
> > 
> 
> 
> -- 
> Qualcomm Innovation Center, Inc.
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project