[CRIU] [PATCH 2/7] cgroup: add support for cgroup namespaces

Pavel Emelyanov xemul at virtuozzo.com
Thu Feb 18 12:40:44 PST 2016


> @@ -997,7 +1004,7 @@ static int userns_move(void *arg, int fd, pid_t pid)
>  	return 0;
>  }
>  
> -static int move_in_cgroup(CgSetEntry *se)
> +static int move_in_cgroup(CgSetEntry *se, bool do_cgns_set)

If I'm not mistaken the do_cgns_set is always true.

>  {
>  	int i;
>  
> @@ -1023,6 +1030,26 @@ static int move_in_cgroup(CgSetEntry *se)
>  
>  		aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0);
>  
> +		if (do_cgns_set && ce->cgns_prefix) {
> +			pr_info("setting cgns prefix to %s\n", ce->cgns_prefix);
> +			snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->cgns_prefix);
> +			if (userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1) < 0) {

Why do we need to do move here? The 2nd would occur several lines below anyway.

> +				pr_perror("couldn't set cgns prefix %s", aux);
> +				return -1;
> +			}
> +
> +			if (unshare(CLONE_NEWCGROUP) < 0) {
> +				pr_perror("couldn't unshare cgns");
> +				return -1;
> +			}
> +		}
> +
> +		/* Note that unshare(CLONE_NEWCGROUP) doesn't change the view
> +		 * of previously mounted cgroupfses; since we're restoring via
> +		 * a dirfd pointing to the cg yard set up by when criu was in
> +		 * the root cgns, we still want to use the full path here when
> +		 * we move into the cgroup.
> +		 */
>  		snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path);
>  		pr_debug("  `-> %s\n", aux);
>  		err = userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1);

> @@ -727,8 +728,23 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
>  	if (ret)
>  		goto err;
>  
> +	/* If this is the root task and it has a cgroup ns id, it could be in
> +	 * a cgroup namespace and we should try to figure out the prefix. Or,
> +	 * if the task is not the parent task and its cgroup namespace differs
> +	 * from its parent's, this is a nested cgns and we should compute the
> +	 * prefix.
> +	 */
> +	if ((!item->parent && item->ids->has_cgroup_ns_id) ||

The has_cgroup_ns_id is always true on dump.

> +			(item->parent->ids->has_cgroup_ns_id &&
> +			 item->ids->cgroup_ns_id != item->parent->ids->cgroup_ns_id)) {
> +		info = &cgroup_args;
> +		ret = parasite_dump_cgroup(ctl, &cgroup_args);
> +		if (ret)
> +			goto err;
> +	}
> +
>  	core->tc->has_cg_set = true;
> -	ret = dump_task_cgroup(item, &core->tc->cg_set);
> +	ret = dump_task_cgroup(item, &core->tc->cg_set, info);
>  	if (ret)
>  		goto err;
>  

> @@ -245,6 +246,16 @@ struct parasite_tty_args {
>  	int	st_excl;
>  };
>  
> +struct parasite_dump_cgroup_args {
> +	/* We choose PAGE_SIZE here since that's how big parasite messages are,
> +	 * although this is probably longer than any /proc/pid/cgroup file will
> +	 * ever be on most systems (4k).
> +	 *
> +	 * The string is null terminated.
> +	 */
> +	char contents[PAGE_SIZE];

This would contain the path to cgroup as task sees it. Why not keep here the
path's lenght, the full path would anyway be found out when dumping cgroups
themselves?

> +};
> +
>  /* the parasite prefix is added by gen_offsets.sh */
>  #define parasite_sym(pblob, name) ((void *)(pblob) + parasite_blob_offset__##name)
>  

> @@ -503,6 +511,41 @@ static inline int parasite_check_vdso_mark(struct parasite_vdso_vma_entry *args)
>  }
>  #endif
>  
> +static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args)
> +{
> +	int proc, cgroup, len;
> +
> +	proc = get_proc_fd();

Isn't it too heavyweight (mkdir + mount + open + umount) for just opening the /proc?

> +	if (proc < 0) {
> +		pr_err("can't get /proc fd\n");
> +		return -1;
> +	}
> +
> +	cgroup = sys_openat(proc, "self/cgroup", O_RDONLY, 0);
> +	sys_close(proc);
> +	if (cgroup < 0) {
> +		pr_err("can't get /proc/self/cgroup fd\n");
> +		sys_close(cgroup);
> +		return -1;
> +	}
> +
> +	len = sys_read(cgroup, args->contents, sizeof(args->contents));
> +	sys_close(cgroup);
> +	if (len < 0) {
> +		pr_err("can't read /proc/self/cgroup %d\n", len);
> +		return -1;
> +	}
> +
> +	if (len == sizeof(*args)) {
> +		pr_warn("/proc/self/cgroup was bigger than the page size\n");
> +		return -1;
> +	}
> +
> +	/* null terminate */
> +	args->contents[len] = 0;
> +	return 0;
> +}
> +
>  static int __parasite_daemon_reply_ack(unsigned int cmd, int err)
>  {
>  	struct ctl_msg m;

> @@ -2158,13 +2158,8 @@ int parse_threads(int pid, struct pid **_t, int *_n)
>  	return 0;
>  }
>  
> -int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
> +int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n)

This one becomes static, doesn't it?

>  {
> -	FILE *f;
> -
> -	f = fopen_proc(pid, "cgroup");
> -	if (f == NULL)
> -		return -1;
>  	while (fgets(buf, BUF_SIZE, f)) {
>  		struct cg_ctl *ncc, *cc;
>  		char *name, *path = NULL, *e;
> @@ -2195,6 +2190,7 @@ int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
>  
>  		ncc->name = xstrdup(name);
>  		ncc->path = xstrdup(path);
> +		ncc->cgns_prefix = NULL;
>  		if (!ncc->name || !ncc->path) {
>  			xfree(ncc->name);
>  			xfree(ncc->path);
> @@ -2210,20 +2206,97 @@ int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
>  		(*n)++;
>  	}
>  
> -	fclose(f);
>  	return 0;
>  
>  err:
>  	put_ctls(retl);
> -	fclose(f);
>  	return -1;
>  }
>  
> +int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n)
> +{
> +	FILE *f;
> +	int ret;
> +	LIST_HEAD(internal);
> +	unsigned int n_internal;
> +	struct cg_ctl *intern, *ext;
> +
> +	f = fopen_proc(pid, "cgroup");
> +	if (!f) {
> +		pr_perror("couldn't open task cgroup file");
> +		return -1;
> +	}
> +
> +	ret = parse_cgroup_file(f, retl, n);
> +	fclose(f);
> +	if (ret < 0)
> +		return -1;
> +
> +	/* No parasite args, we're dumping criu's cg set, so we don't need to
> +	 * try and parse the "internal" cgroup set to find namespace
> +	 * boundaries.
> +	 */
> +	if (!args)
> +		return 0;
> +
> +	f = fmemopen(args->contents, strlen(args->contents), "r");
> +	if (!f) {
> +		pr_perror("couldn't fmemopen cgroup buffer:\n%s\n", args->contents);
> +		return -1;
> +	}
> +
> +	ret = parse_cgroup_file(f, &internal, &n_internal);
> +	fclose(f);
> +	if (ret < 0) {
> +		pr_err("couldn't parse internal cgroup file\n");
> +		return -1;
> +	}
> +
> +	list_for_each_entry(intern, &internal, l) {
> +		list_for_each_entry(ext, retl, l) {
> +			char *pos, tmp;
> +
> +			if (strcmp(ext->name, intern->name))
> +				continue;
> +
> +			/* If the cgroup namespace was unshared at / (or there
> +			 * is no cgroup namespace relative to criu), the paths
> +			 * are equal and we don't need to set a prefix.
> +			 */
> +			if (!strcmp(ext->path, intern->path))
> +				continue;
> +
> +			/* +1 here to chop off the leading / */
> +			pos = ext->path + strlen(ext->path) - strlen(intern->path+1);
> +			if (strcmp(pos, intern->path+1)) {
> +				pr_err("invalid cgroup configuration, %s is not a suffix of %s\n", intern->path, ext->path);
> +				ret = -1;
> +				goto out;
> +			}
> +
> +			tmp = *pos;
> +			*pos = '\0';
> +			ext->cgns_prefix = xstrdup(ext->path);
> +			if (!ext->cgns_prefix) {
> +				ret = -1;
> +				goto out;
> +			}
> +			*pos = tmp;
> +		}
> +	}

Erm... I'm not sure I fully understand what's going on in this routine.
Would you comment on it please?

> +
> +out:
> +	put_ctls(&internal);
> +	return ret;
> +}
> +
>  void put_ctls(struct list_head *l)
>  {
>  	struct cg_ctl *c, *n;
>  
>  	list_for_each_entry_safe(c, n, l, l) {
> +		if (c->cgns_prefix)
> +			xfree(c->cgns_prefix);
>  		xfree(c->name);
>  		xfree(c->path);
>  		xfree(c);
> diff --git a/images/cgroup.proto b/images/cgroup.proto
> index dcd2fe8..f255c8c 100644
> --- a/images/cgroup.proto
> +++ b/images/cgroup.proto
> @@ -23,8 +23,9 @@ message cg_controller_entry {
>  }
>  
>  message cg_member_entry {
> -	required string name	= 1;
> -	required string path	= 2;
> +	required string name		= 1;
> +	required string path		= 2;
> +	optional string cgns_prefix	= 3;

Same as for parasite args -- maybe it's more efficient to to save prefix lenght as int here?

>  }
>  
>  message cg_set_entry {
> diff --git a/images/core.proto b/images/core.proto
> index 6def5d9..824ee26 100644
> --- a/images/core.proto
> +++ b/images/core.proto
> @@ -57,6 +57,7 @@ message task_kobj_ids_entry {
>  	optional uint32			uts_ns_id	= 8;
>  	optional uint32			mnt_ns_id	= 9;
>  	optional uint32			user_ns_id	= 10;
> +	optional uint32			cgroup_ns_id	= 11;
>  }
>  
>  message thread_sas_entry {
> 



More information about the CRIU mailing list