[CRIU] [PATCH 3/3] dump: use freezer cgroup to seize processes (v2)

Tue Aug 4 06:26:36 PDT 2015

On 08/04/2015 08:06 AM, Andrew Vagin wrote:
> On Tue, Aug 04, 2015 at 07:50:46AM -0400, Christopher Covington wrote:
>> On 08/03/2015 10:18 PM, Andrey Vagin wrote:
>>> Without using a freezer cgroup, we need to do a few iterations to catch
>>> all tasks, because a new tasks can be born. If new tasks appear faster
>>> than criu collects them, criu fails. The freezer cgroup allows to
>>> solve this problem.
>>>
>>> We freeze the freezer group, then attaches to tasks with ptrace and thaw
>>> the freezer cgroup. We suppose that all tasks which are going to be
>>> dumped in a specified freezer group.
>>>
>>> v2: fix comments from Christopher
>>>
>>> Cc: Christopher Covington <cov at codeaurora.org>
>>> Signed-off-by: Andrey Vagin <avagin at openvz.org>
>>> ---
>>>  cr-exec.c            |   2 +-
>>>  crtools.c            |   6 ++
>>>  include/cr_options.h |   1 +
>>>  include/ptrace.h     |   5 +-
>>>  proc_parse.c         |   1 +
>>>  ptrace.c             |  54 +++++++++------
>>>  seize.c              | 185 +++++++++++++++++++++++++++++++++++++++++++++++++--
>>>  7 files changed, 226 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/cr-exec.c b/cr-exec.c
>>> index f3d55f6..b149c38 100644
>>> --- a/cr-exec.c
>>> +++ b/cr-exec.c
>>> @@ -130,7 +130,7 @@ int cr_exec(int pid, char **opt)
>>>  		goto out;
>>>  	}
>>>  
>>> -	prev_state = ret = seize_task(pid, -1, &creds);
>>> +	prev_state = ret = seize_task(pid, -1, &creds, PTRACE_FREEZE);
>>>  	if (ret < 0) {
>>>  		pr_err("Can't seize task %d\n", pid);
>>>  		goto out;
>>> diff --git a/crtools.c b/crtools.c
>>> index 6af6080..9096420 100644
>>> --- a/crtools.c
>>> +++ b/crtools.c
>>> @@ -235,6 +235,7 @@ int main(int argc, char *argv[], char *envp[])
>>>  		{ "enable-fs",			required_argument,	0, 1065 },
>>>  		{ "enable-external-sharing", 	no_argument, 		0, 1066 },
>>>  		{ "enable-external-masters", 	no_argument, 		0, 1067 },
>>> +		{ "freeze-cgroup",		required_argument,	0, 1068 },
>>>  		{ },
>>>  	};
>>>  
>>> @@ -465,6 +466,9 @@ int main(int argc, char *argv[], char *envp[])
>>>  		case 1067:
>>>  			opts.enable_external_masters = true;
>>>  			break;
>>> +		case 1068:
>>> +			opts.freeze_cgroup = optarg;
>>> +			break;
>>>  		case 'M':
>>>  			{
>>>  				char *aux;
>>> @@ -676,6 +680,8 @@ usage:
>>>  "                        'cpu','fpu','all','ins','none'. To disable capability, prefix it with '^'.\n"
>>>  "     --exec-cmd         execute the command specified after '--' on successful\n"
>>>  "                        restore making it the parent of the restored process\n"
>>> +"  --freeze-cgroup\n"
>>> +"                        use cgroup freezer to collect processes\n"
>>>  "\n"
>>>  "* Special resources support:\n"
>>>  "  -x|--" USK_EXT_PARAM "inode,.." "      allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
>>> diff --git a/include/cr_options.h b/include/cr_options.h
>>> index 19c2f77..f981806 100644
>>> --- a/include/cr_options.h
>>> +++ b/include/cr_options.h
>>> @@ -57,6 +57,7 @@ struct cr_options {
>>>  	char			*output;
>>>  	char			*root;
>>>  	char			*pidfile;
>>> +	char			*freeze_cgroup;
>>>  	struct list_head	veth_pairs;
>>>  	struct list_head	scripts;
>>>  	struct list_head	ext_mounts;
>>> diff --git a/include/ptrace.h b/include/ptrace.h
>>> index 44b66c9..ce91a45 100644
>>> --- a/include/ptrace.h
>>> +++ b/include/ptrace.h
>>> @@ -67,7 +67,10 @@ struct ptrace_peeksiginfo_args {
>>>  
>>>  #define SI_EVENT(_si_code)	(((_si_code) & 0xFFFF) >> 8)
>>>  
>>> -extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds);
>>> +#define PTRACE_FREEZE_PRE	0 /* Attach to the process (PTRACE_SEIZE and PTRACE_INTERRUPT) */
>>> +#define PTRACE_FREEZE_POST	1 /* Wait when the process will be stopped  */
>>> +#define PTRACE_FREEZE		2 /* Do both stages together */
>>> +extern int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze);
>>>  extern int suspend_seccomp(pid_t pid);
>>>  extern int unseize_task(pid_t pid, int orig_state, int state);
>>>  extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
>>> diff --git a/proc_parse.c b/proc_parse.c
>>> index 95d6505..ea98a97 100644
>>> --- a/proc_parse.c
>>> +++ b/proc_parse.c
>>> @@ -1135,6 +1135,7 @@ struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump)
>>>  
>>>  		new->nsid = nsid;
>>>  
>>> +		pr_debug("%s\n", str);
>>>  		ret = parse_mountinfo_ent(str, new, &fsname);
>>>  		if (ret < 0) {
>>>  			pr_err("Bad format in %d mountinfo: '%s'\n", pid, str);
>>> diff --git a/ptrace.c b/ptrace.c
>>> index 905eaec..e614be4 100644
>>> --- a/ptrace.c
>>> +++ b/ptrace.c
>>> @@ -60,11 +60,11 @@ int suspend_seccomp(pid_t pid)
>>>   * up with someone else.
>>>   */
>>>  
>>> -int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
>>> +int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds, int freeze)
>>>  {
>>>  	siginfo_t si;
>>>  	int status;
>>> -	int ret, ret2, ptrace_errno, wait_errno = 0;
>>> +	int ret = 0, ret2, ptrace_errno = 0, wait_errno = 0;
>>>  	struct proc_status_creds cr;
>>>  
>>>  	/*
>>> @@ -72,23 +72,25 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
>>>  	 */
>>>  	memzero(&cr, sizeof(struct proc_status_creds));
>>>  
>>> -	ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
>>> -	ptrace_errno = errno;
>>> -	if (ret == 0) {
>>> -		/*
>>> -		 * If we SEIZE-d the task stop it before going
>>> -		 * and reading its stat from proc. Otherwise task
>>> -		 * may die _while_ we're doing it and we'll have
>>> -		 * inconsistent seize/state pair.
>>> -		 *
>>> -		 * If task dies after we seize it but before we
>>> -		 * do this interrupt, we'll notice it via proc.
>>> -		 */
>>> -		ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
>>> -		if (ret < 0) {
>>> -			pr_perror("SEIZE %d: can't interrupt task", pid);
>>> -			ptrace(PTRACE_DETACH, pid, NULL, NULL);
>>> -			goto err;
>>> +	if (freeze != PTRACE_FREEZE_POST) {
>>> +		ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
>>> +		ptrace_errno = errno;
>>> +		if (ret == 0) {
>>> +			/*
>>> +			 * If we SEIZE-d the task stop it before going
>>> +			 * and reading its stat from proc. Otherwise task
>>> +			 * may die _while_ we're doing it and we'll have
>>> +			 * inconsistent seize/state pair.
>>> +			 *
>>> +			 * If task dies after we seize it but before we
>>> +			 * do this interrupt, we'll notice it via proc.
>>> +			 */
>>> +			ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
>>> +			if (ret < 0) {
>>> +				pr_perror("SEIZE %d: can't interrupt task", pid);
>>> +				ptrace(PTRACE_DETACH, pid, NULL, NULL);
>>> +				goto err;
>>> +			}
>>>  		}
>>>  	}
>>>  
>>> @@ -101,8 +103,18 @@ int seize_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
>>>  
>>>  try_again:
>>>  	if (!ret) {
>>> -		ret = wait4(pid, &status, __WALL, NULL);
>>> +		int flags = 0;
>>> +
>>> +		if (freeze == PTRACE_FREEZE_PRE)
>>> +			flags |= WNOHANG;
>>> +
>>> +		ret = wait4(pid, &status, __WALL | flags, NULL);
>>>  		wait_errno = errno;
>>> +
>>> +		if (freeze == PTRACE_FREEZE_PRE && ret > 0) {
>>> +			pr_err("The %d task is not frozen\n", pid);
>>> +			goto err;
>>> +		}
>>>  	}
>>>  
>>>  	ret2 = parse_pid_status(pid, &cr);
>>> @@ -126,6 +138,8 @@ try_again:
>>>  
>>>  		return TASK_DEAD;
>>>  	}
>>> +	if (freeze == PTRACE_FREEZE_PRE)
>>> +		return 0;
>>>  
>>>  	if ((ppid != -1) && (cr.ppid != ppid)) {
>>>  		pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
>>> diff --git a/seize.c b/seize.c
>>> index e9be332..6ed664f 100644
>>> --- a/seize.c
>>> +++ b/seize.c
>>> @@ -18,6 +18,163 @@
>>>  
>>>  #define NR_ATTEMPTS 5
>>
>> There appears to be an identical definition in cr-dump.c. Perhaps they can be
>> consolidated into a single definition in a common header?
> 
> I moved a part of cr-dump.c to seize.c in the first patch.
> 
>>
>>> +const char frozen[]	= "FROZEN";
>>> +const char freezing[]	= "FREEZING";
>>> +const char thawed[]	= "THAWED";
>>> +
>>> +static const char *get_freezer_state(int fd)
>>> +{
>>> +	int ret;
>>> +	char path[PATH_MAX];
>>> +
>>> +	lseek(fd, 0, SEEK_SET);
>>> +	ret = read(fd, path, sizeof(path) - 1);
>>> +	if (ret <= 0) {
>>
>> I don't know if this could happen on cgroupsfs:
>> should you loop if 0 < ret < sizeof(path) - 1?
> 
> Currently, it isn't required. And I suppose that this code will always
> work. But theoretically you are right.
> 
> 
>>
>>> +		pr_perror("Unable to get a current state");
>>> +		goto err;
>>> +	}
>>> +	if (path[ret - 1] == '\n')
>>> +		path[ret - 1] = 0;
>>> +	else
>>> +		path[ret] = 0;
>>> +
>>> +	pr_debug("freezer.state=%s\n", path);
>>> +	if (strcmp(path, frozen) == 0)
>>> +		return frozen;
>>> +	if (strcmp(path, freezing) == 0)
>>> +		return freezing;
>>> +	if (strcmp(path, thawed) == 0)
>>> +		return thawed;
>>> +
>>> +	pr_err("Unknown freezer state: %s", path);
>>> +err:
>>> +	return NULL;
>>> +}
>>> +
>>> +static bool freezer_thawed;
>>> +
>>> +static int freezer_restore_state(void)
>>> +{
>>> +	int fd;
>>> +	char path[PATH_MAX];
>>> +
>>> +	if (!opts.freeze_cgroup || freezer_thawed)
>>> +		return 0;
>>> +
>>> +	snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
>>> +	fd = open(path, O_RDWR);
>>> +	if (fd < 0) {
>>> +		pr_perror("Unable to open %s", path);
>>> +		return -1;
>>> +	}
>>> +
>>> +	if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
>>> +			pr_perror("Unable to freeze tasks");
>>> +			close(fd);
>>> +			return -1;
>>> +	}
>>> +	close(fd);
>>> +	return 0;
>>> +}
>>> +
>>> +static int freeze_processes(void)
>>> +{
>>> +	int i, ret, fd, exit_code = -1;
>>> +	char path[PATH_MAX];
>>> +	const char *state = thawed;
>>> +	FILE *f;
>>> +
>>> +	snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
>>> +	fd = open(path, O_RDWR);
>>> +	if (fd < 0) {
>>> +		pr_perror("Unable to open %s", path);
>>> +		return -1;
>>> +	}
>>> +	state = get_freezer_state(fd);
>>> +	if (!state) {
>>> +		close(fd);
>>> +		return -1;
>>> +	}
>>> +	if (state == thawed)
>>> +		freezer_thawed = true;
>>> +
>>> +	lseek(fd, 0, SEEK_SET);
>>> +	if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
>>> +		pr_perror("Unable to freeze tasks");
>>> +		close(fd);
>>> +		return -1;
>>> +	}
>>> +
>>> +	/*
>>> +	 * There is not way to wait a specified state, so we need to poll the
>>> +	 * freezer.state
>>> +	 */
>>> +	for (i = 0; i < NR_ATTEMPTS; i++) {
>>> +		struct timespec req = {};
>>> +
>>> +		snprintf(path, sizeof(path), "%s/tasks", opts.freeze_cgroup);
>>> +		f = fopen(path, "r");
>>> +		if (f == NULL) {
>>> +			pr_perror("Unable to open %s", path);
>>> +			goto err;
>>> +		}
>>> +		while (fgets(path, sizeof(path), f)) {
>>> +			pid_t pid;
>>> +
>>> +			pid = atoi(path);
>>> +
>>> +			ret = wait4(pid, NULL, __WALL | WNOHANG, NULL);
>>> +			if (ret == 0) /* skip already seized tasks */
>>> +				continue;
>>> +			if (seize_task(pid, 0, NULL, PTRACE_FREEZE_PRE)) {
>>> +				fclose(f);
>>> +				goto err;
>>> +			}
>>> +		}
>>> +		fclose(f);
>>> +
>>> +		if (state == frozen)
>>> +			break;
>>> +
>>> +		state = get_freezer_state(fd);
>>> +		if (!state)
>>> +			goto err;
>>> +
>>> +		if (state == frozen) {
>>> +			/*
>>> +			 * Enumerate all tasks one more time to collect all new
>>> +			 * tasks, which can be born while the cgroup is being frozen.
>>> +			 */
>>> +
>>> +			continue;
>>> +		}
>>> +
>>> +		req.tv_nsec = 10000000 * i;
>>> +		nanosleep(&req, NULL);
>>> +	}
>>> +
>>> +	if (i == NR_ATTEMPTS) {
>>> +		pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
>>> +		goto err;
>>> +	}
>>> +
>>> +	exit_code = 0;
>>> +err:
>>> +	if (exit_code == 0 || freezer_thawed) {
>>> +		lseek(fd, 0, SEEK_SET);
>>> +		if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) {
>>> +			pr_perror("Unable to thaw tasks");
>>> +			exit_code = -1;
>>> +		}
>>> +	}
>>
>> I guess you don't need to poll freezer.state here, because there aren't
>> further events that are dependent on the transition.
> 
> Yes, you are right. I will add a comment here.
> 
>>
>>> +	if (close(fd)) {
>>> +		pr_perror("Unable to thaw tasks");
>>> +		return -1;
>>> +	}
>>> +
>>> +	return exit_code;
>>> +}
>>> +
>>>  static inline bool child_collected(struct pstree_item *i, pid_t pid)
>>>  {
>>>  	struct pstree_item *c;
>>> @@ -33,7 +190,9 @@ static int collect_task(struct pstree_item *item);
>>>  static int collect_children(struct pstree_item *item)
>>>  {
>>>  	pid_t *ch;
>>> -	int ret, i, nr_children, nr_inprogress;
>>> +	int ret, i, nr_children, nr_inprogress, freeze;
>>> +
>>> +	freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
>>>  
>>>  	ret = parse_children(item->pid.real, &ch, &nr_children);
>>>  	if (ret < 0)
>>> @@ -58,7 +217,7 @@ static int collect_children(struct pstree_item *item)
>>>  			goto free;
>>>  		}
>>>  
>>> -		ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds);
>>> +		ret = seize_task(pid, item->pid.real, &dmpi(c)->pi_creds, freeze);
>>>  		if (ret < 0) {
>>>  			/*
>>>  			 * Here is a race window between parse_children() and seize(),
>>> @@ -143,6 +302,9 @@ void pstree_switch_state(struct pstree_item *root_item, int st)
>>>  {
>>>  	struct pstree_item *item = root_item;
>>>  
>>> +	if (st != TASK_DEAD)
>>> +		freezer_restore_state();
>>> +
>>>  	pr_info("Unfreezing tasks into %d\n", st);
>>>  	for_each_pstree_item(item)
>>>  		unseize_task_and_threads(item, st);
>>> @@ -174,7 +336,9 @@ static inline bool thread_collected(struct pstree_item *i, pid_t tid)
>>>  static int collect_threads(struct pstree_item *item)
>>>  {
>>>  	struct pid *threads = NULL;
>>> -	int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;
>>> +	int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0, freeze;
>>> +
>>> +	freeze = opts.freeze_cgroup ? PTRACE_FREEZE_POST : PTRACE_FREEZE;
>>>  
>>>  	ret = parse_threads(item->pid.real, &threads, &nr_threads);
>>>  	if (ret < 0)
>>> @@ -207,7 +371,7 @@ static int collect_threads(struct pstree_item *item)
>>>  		pr_info("\tSeizing %d's %d thread\n",
>>>  				item->pid.real, pid);
>>>  
>>> -		ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds);
>>> +		ret = seize_task(pid, item_ppid(item), &dmpi(item)->pi_creds, freeze);
>>>  		if (ret < 0) {
>>>  			/*
>>>  			 * Here is a race window between parse_threads() and seize(),
>>> @@ -251,6 +415,9 @@ static int collect_loop(struct pstree_item *item,
>>>  {
>>>  	int attempts = NR_ATTEMPTS, nr_inprogress = 1;
>>>  
>>> +	if (opts.freeze_cgroup)
>>> +		attempts = 2; /* double check that we skip nothing */
>>
>> attempts was 5. You decrease it to 2. How does that provide additional checking?
> 
> On the first attempt, we collect all tasks. On the second attempt, we
> check that we don't find new tasks (nr_inprogress == 0).

Sounds good. If you'd like to add:

Reviewed-by: Christopher Covington <cov at codeaurora.org>

-- 
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project