[CRIU] [PATCH 3/3] dump: use freezer cgroup to seize processes (v2)

Tue Aug 4 04:50:46 PDT 2015

On 08/03/2015 10:18 PM, Andrey Vagin wrote:
> Without using a freezer cgroup, we need to do a few iterations to catch
> all tasks, because new tasks can be born. If new tasks appear faster
> than criu collects them, criu fails. The freezer cgroup allows us to
> solve this problem.
> 
> We freeze the freezer cgroup, then attach to the tasks with ptrace and thaw
> the freezer cgroup. We assume that all tasks which are going to be
> dumped are in the specified freezer cgroup.
> 
> v2: fix comments from Christopher
> 
> Cc: Christopher Covington <cov at codeaurora.org>
> Signed-off-by: Andrey Vagin <avagin at openvz.org>
> ---
>  cr-exec.c            |   2 +-
>  crtools.c            |   6 ++
>  include/cr_options.h |   1 +
>  include/ptrace.h     |   5 +-
>  proc_parse.c         |   1 +
>  ptrace.c             |  54 +++++++++------
>  seize.c              | 185 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  7 files changed, 226 insertions(+), 28 deletions(-)
> 
> diff --git a/cr-exec.c b/cr-exec.c
> index f3d55f6..b149c38 100644
> --- a/cr-exec.c
> +++ b/cr-exec.c
> @@ -130,7 +130,7 @@ int cr_exec(int pid, char **opt)
>  		goto out;
>  	}
>  
> -	prev_state = ret = seize_task(pid, -1, &creds);
> +	prev_state = ret = seize_task(pid, -1, &creds, PTRACE_FREEZE);
>  	if (ret < 0) {
>  		pr_err("Can't seize task %d\n", pid);
>  		goto out;
> diff --git a/crtools.c b/crtools.c
> index 6af6080..9096420 100644
> --- a/crtools.c
> +++ b/crtools.c
> @@ -235,6 +235,7 @@ int main(int argc, char *argv[], char *envp[])
>  		{ "enable-fs",			required_argument,	0, 1065 },
>  		{ "enable-external-sharing", 	no_argument, 		0, 1066 },
>  		{ "enable-external-masters", 	no_argument, 		0, 1067 },
> +		{ "freeze-cgroup",		required_argument,	0, 1068 },
>  		{ },
>  	};
>  
> @@ -465,6 +466,9 @@ int main(int argc, char *argv[], char *envp[])
>  		case 1067:
>  			opts.enable_external_masters = true;
>  			break;
> +		case 1068:
> +			opts.freeze_cgroup = optarg;
> +			break;
>  		case 'M':
>  			{
>  				char *aux;
> @@ -676,6 +680,8 @@ usage:
>  "                        'cpu','fpu','all','ins','none'. To disable capability, prefix it with '^'.\n"
>  "     --exec-cmd         execute the command specified after '--' on successful\n"
>  "                        restore making it the parent of the restored process\n"
> +"  --freeze-cgroup\n"
> +"                        use cgroup freezer to collect processes\n"
>  "\n"
>  "* Special resources support:\n"
>  "  -x|--" USK_EXT_PARAM "inode,.." "      allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
> diff --git a/include/cr_options.h b/include/cr_options.h
> index 19c2f77..f981806 100644
> --- a/include/cr_options.h
> +++ b/include/cr_options.h
> @@ -57,6 +57,7 @@ struct cr_options {
>  	char			*output;
>  	char			*root;
>  	char			*pidfile;
> +	char			*freeze_cgroup;
>  	struct list_head	veth_pairs;
>  	struct list_head	scripts;
>  	struct list_head	ext_mounts;
> diff --git a/include/ptrace.h b/include/ptrace.h
> index 44b66c9..ce91a45 100644
> --- a/include/ptrace.h
> +++ b/include/ptrace.h
> @@ -67,7 +67,10 @@ struct ptrace_peeksiginfo_args {
>  
>  #define SI_EVENT(_si_code)	(((_si_code) & 0xFFFF) >> 8)
>  
> -extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds);
> +#define PTRACE_FREEZE_PRE	0 /* Attach to the process (PTRACE_SEIZE and PTRACE_INTERRUPT) */
> +#define PTRACE_FREEZE_POST	1 /* Wait when the process will be stopped  */
> +#define PTRACE_FREEZE		2 /* Do both stages together */
> +extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze);
>  extern int suspend_seccomp(pid_t pid);
>  extern int unseize_task(pid_t pid, int orig_state, int state);
>  extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
> diff --git a/proc_parse.c b/proc_parse.c
> index 95d6505..ea98a97 100644
> --- a/proc_parse.c
> +++ b/proc_parse.c
> @@ -1135,6 +1135,7 @@ struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump)
>  
>  		new->nsid = nsid;
>  
> +		pr_debug("%s\n", str);
>  		ret = parse_mountinfo_ent(str, new, &fsname);
>  		if (ret < 0) {
>  			pr_err("Bad format in %d mountinfo: '%s'\n", pid, str);
> diff --git a/ptrace.c b/ptrace.c
> index 905eaec..e614be4 100644
> --- a/ptrace.c
> +++ b/ptrace.c
> @@ -60,11 +60,11 @@ int suspend_seccomp(pid_t pid)
>   * up with someone else.
>   */
>  
> -int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
> +int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze)
>  {
>  	siginfo_t si;
>  	int status;
> -	int ret, ret2, ptrace_errno, wait_errno = 0;
> +	int ret = 0, ret2, ptrace_errno = 0, wait_errno = 0;
>  	struct proc_status_creds cr;
>  
>  	/*
> @@ -72,23 +72,25 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
>  	 */
>  	memzero(&cr, sizeof(struct proc_status_creds));
>  
> -	ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
> -	ptrace_errno = errno;
> -	if (ret == 0) {
> -		/*
> -		 * If we SEIZE-d the task stop it before going
> -		 * and reading its stat from proc. Otherwise task
> -		 * may die _while_ we're doing it and we'll have
> -		 * inconsistent seize/state pair.
> -		 *
> -		 * If task dies after we seize it but before we
> -		 * do this interrupt, we'll notice it via proc.
> -		 */
> -		ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
> -		if (ret < 0) {
> -			pr_perror("SEIZE %d: can't interrupt task", pid);
> -			ptrace(PTRACE_DETACH, pid, NULL, NULL);
> -			goto err;
> +	if (freeze != PTRACE_FREEZE_POST) {
> +		ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
> +		ptrace_errno = errno;
> +		if (ret == 0) {
> +			/*
> +			 * If we SEIZE-d the task stop it before going
> +			 * and reading its stat from proc. Otherwise task
> +			 * may die _while_ we're doing it and we'll have
> +			 * inconsistent seize/state pair.
> +			 *
> +			 * If task dies after we seize it but before we
> +			 * do this interrupt, we'll notice it via proc.
> +			 */
> +			ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
> +			if (ret < 0) {
> +				pr_perror("SEIZE %d: can't interrupt task", pid);
> +				ptrace(PTRACE_DETACH, pid, NULL, NULL);
> +				goto err;
> +			}
>  		}
>  	}
>  
> @@ -101,8 +103,18 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
>  
>  try_again:
>  	if (!ret) {
> -		ret = wait4(pid, &status, __WALL, NULL);
> +		int flags = 0;
> +
> +		if (freeze == PTRACE_FREEZE_PRE)
> +			flags |= WNOHANG;
> +
> +		ret = wait4(pid, &status, __WALL | flags, NULL);
>  		wait_errno = errno;
> +
> +		if (freeze == PTRACE_FREEZE_PRE && ret > 0) {
> +			pr_err("The %d task is not frozen\n", pid);
> +			goto err;
> +		}
>  	}
>  
>  	ret2 = parse_pid_status(pid, &cr);
> @@ -126,6 +138,8 @@ try_again:
>  
>  		return TASK_DEAD;
>  	}
> +	if (freeze == PTRACE_FREEZE_PRE)
> +		return 0;
>  
>  	if ((ppid != -1) && (cr.ppid != ppid)) {
>  		pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
> diff --git a/seize.c b/seize.c
> index e9be332..6ed664f 100644
> --- a/seize.c
> +++ b/seize.c
> @@ -18,6 +18,163 @@
>  
>  #define NR_ATTEMPTS 5

There appears to be an identical definition in cr-dump.c. Perhaps they can be
consolidated into a single definition in a common header?

> +const char frozen[]	= "FROZEN";
> +const char freezing[]	= "FREEZING";
> +const char thawed[]	= "THAWED";
> +
> +static const char *get_freezer_state(int fd)
> +{
> +	int ret;
> +	char path[PATH_MAX];
> +
> +	lseek(fd, 0, SEEK_SET);
> +	ret = read(fd, path, sizeof(path) - 1);
> +	if (ret <= 0) {

I don't know if a short read can happen on cgroupfs:
should you loop if 0 < ret < sizeof(path) - 1?

> +		pr_perror("Unable to get a current state");
> +		goto err;
> +	}
> +	if (path[ret - 1] == '\n')
> +		path[ret - 1] = 0;
> +	else
> +		path[ret] = 0;
> +
> +	pr_debug("freezer.state=%s\n", path);
> +	if (strcmp(path, frozen) == 0)
> +		return frozen;
> +	if (strcmp(path, freezing) == 0)
> +		return freezing;
> +	if (strcmp(path, thawed) == 0)
> +		return thawed;
> +
> +	pr_err("Unknown freezer state: %s", path);
> +err:
> +	return NULL;
> +}
> +
> +static bool freezer_thawed;
> +
> +static int freezer_restore_state(void)
> +{
> +	int fd;
> +	char path[PATH_MAX];
> +
> +	if (!opts.freeze_cgroup || freezer_thawed)
> +		return 0;
> +
> +	snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
> +	fd = open(path, O_RDWR);
> +	if (fd < 0) {
> +		pr_perror("Unable to open %s", path);
> +		return -1;
> +	}
> +
> +	if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
> +			pr_perror("Unable to freeze tasks");
> +			close(fd);
> +			return -1;
> +	}
> +	close(fd);
> +	return 0;
> +}
> +
> +static int freeze_processes(void)
> +{
> +	int i, ret, fd, exit_code = -1;
> +	char path[PATH_MAX];
> +	const char *state = thawed;
> +	FILE *f;
> +
> +	snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
> +	fd = open(path, O_RDWR);
> +	if (fd < 0) {
> +		pr_perror("Unable to open %s", path);
> +		return -1;
> +	}
> +	state = get_freezer_state(fd);
> +	if (!state) {
> +		close(fd);
> +		return -1;
> +	}
> +	if (state == thawed)
> +		freezer_thawed = true;
> +
> +	lseek(fd, 0, SEEK_SET);
> +	if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
> +		pr_perror("Unable to freeze tasks");
> +		close(fd);
> +		return -1;
> +	}
> +
> +	/*
> +	 * There is not way to wait a specified state, so we need to poll the
> +	 * freezer.state
> +	 */
> +	for (i = 0; i < NR_ATTEMPTS; i++) {
> +		struct timespec req = {};
> +
> +		snprintf(path, sizeof(path), "%s/tasks", opts.freeze_cgroup);
> +		f = fopen(path, "r");
> +		if (f == NULL) {
> +			pr_perror("Unable to open %s", path);
> +			goto err;
> +		}
> +		while (fgets(path, sizeof(path), f)) {
> +			pid_t pid;
> +
> +			pid = atoi(path);
> +
> +			ret = wait4(pid, NULL, __WALL | WNOHANG, NULL);
> +			if (ret == 0) /* skip already seized tasks */
> +				continue;
> +			if (seize_task(pid, 0, NULL, PTRACE_FREEZE_PRE)) {
> +				fclose(f);
> +				goto err;
> +			}
> +		}
> +		fclose(f);
> +
> +		if (state == frozen)
> +			break;
> +
> +		state = get_freezer_state(fd);
> +		if (!state)
> +			goto err;
> +
> +		if (state == frozen) {
> +			/*
> +			 * Enumerate all tasks one more time to collect all new
> +			 * tasks, which can be born while the cgroup is being frozen.
> +			 */
> +
> +			continue;
> +		}
> +
> +		req.tv_nsec = 10000000 * i;
> +		nanosleep(&req, NULL);
> +	}
> +
> +	if (i == NR_ATTEMPTS) {
> +		pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
> +		goto err;
> +	}
> +
> +	exit_code = 0;
> +err:
> +	if (exit_code == 0 || freezer_thawed) {
> +		lseek(fd, 0, SEEK_SET);
> +		if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) {
> +			pr_perror("Unable to thaw tasks");
> +			exit_code = -1;
> +		}
> +	}

I guess you don't need to poll freezer.state here, because there aren't
further events that are dependent on the transition.

> +	if (close(fd)) {
> +		pr_perror("Unable to thaw tasks");
> +		return -1;
> +	}
> +
> +	return exit_code;
> +}
> +
>  static inline bool child_collected(struct pstree_item *i, pid_t pid)
>  {
>  	struct pstree_item *c;
> @@ -33,7 +190,9 @@ static int collect_task(struct pstree_item *item);
>  static int collect_children(struct pstree_item *item)
>  {
>  	pid_t *ch;
> -	int ret, i, nr_children, nr_inprogress;
> +	int ret, i, nr_children, nr_inprogress, freeze;
> +
> +	freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
>  
>  	ret = parse_children(item->pid.real, &ch, &nr_children);
>  	if (ret < 0)
> @@ -58,7 +217,7 @@ static int collect_children(struct pstree_item *item)
>  			goto free;
>  		}
>  
> -		ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds);
> +		ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds, freeze);
>  		if (ret < 0) {
>  			/*
>  			 * Here is a race window between parse_children() and seize(),
> @@ -143,6 +302,9 @@ void pstree_switch_state(struct pstree_item *root_item, int st)
>  {
>  	struct pstree_item *item = root_item;
>  
> +	if (st != TASK_DEAD)
> +		freezer_restore_state();
> +
>  	pr_info("Unfreezing tasks into %d\n", st);
>  	for_each_pstree_item(item)
>  		unseize_task_and_threads(item, st);
> @@ -174,7 +336,9 @@ static inline bool thread_collected(struct pstree_item *i, pid_t tid)
>  static int collect_threads(struct pstree_item *item)
>  {
>  	struct pid *threads = NULL;
> -	int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;
> +	int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0, freeze;
> +
> +	freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
>  
>  	ret = parse_threads(item->pid.real, &threads, &nr_threads);
>  	if (ret < 0)
> @@ -207,7 +371,7 @@ static int collect_threads(struct pstree_item *item)
>  		pr_info("\tSeizing %d's %d thread\n",
>  				item->pid.real, pid);
>  
> -		ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds);
> +		ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds, freeze);
>  		if (ret < 0) {
>  			/*
>  			 * Here is a race window between parse_threads() and seize(),
> @@ -251,6 +415,9 @@ static int collect_loop(struct pstree_item *item,
>  {
>  	int attempts = NR_ATTEMPTS, nr_inprogress = 1;
>  
> +	if (opts.freeze_cgroup)
> +		attempts = 2; /* double check that we skip nothing */

attempts was 5. You decrease it to 2. How does that provide additional checking?

>  	/*
>  	 * While we scan the proc and seize the children/threads
>  	 * new ones can appear (with clone(CLONE_PARENT) or with
> @@ -307,7 +474,13 @@ err_close:
>  
>  int collect_pstree(pid_t pid)
>  {
> -	int ret;
> +	int ret, freeze = PTRACE_FREEZE;
> +
> +	if (opts.freeze_cgroup) {
> +		if (freeze_processes())
> +			return -1;
> +		freeze = PTRACE_FREEZE_POST;
> +	}
>  
>  	timing_start(TIME_FREEZING);
>  
> @@ -316,7 +489,7 @@ int collect_pstree(pid_t pid)
>  		return -1;
>  
>  	root_item->pid.real = pid;
> -	ret = seize_task(pid, -1, &dmpi(root_item)->pi_creds);
> +	ret = seize_task(pid, -1, &dmpi(root_item)->pi_creds, freeze);
>  	if (ret < 0)
>  		goto err;
>  	pr_info("Seized task %d, state %d\n", pid, ret);
> 

-- 
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project