[CRIU] [PATCH 2/7] cgroup: add support for cgroup namespaces

Fri Feb 19 08:02:57 PST 2016

On Thu, Feb 18, 2016 at 11:40:44PM +0300, Pavel Emelyanov wrote:
> 
> > @@ -997,7 +1004,7 @@ static int userns_move(void *arg, int fd, pid_t pid)
> >  	return 0;
> >  }
> >  
> > -static int move_in_cgroup(CgSetEntry *se)
> > +static int move_in_cgroup(CgSetEntry *se, bool do_cgns_set)
> 
> If I'm not mistaken the do_cgns_set is always true.

Ah, perhaps this is left over from a previous version, I'll drop it.

> >  {
> >  	int i;
> >  
> > @@ -1023,6 +1030,26 @@ static int move_in_cgroup(CgSetEntry *se)
> >  
> >  		aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0);
> >  
> > +		if (do_cgns_set && ce->cgns_prefix) {
> > +			pr_info("setting cgns prefix to %s\n", ce->cgns_prefix);
> > +			snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->cgns_prefix);
> > +			if (userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1) < 0) {
> 
> Why do we need to do move here? The 2nd would occur several lines below anyway.

Consider a task in a cgroup (according to the host):

/unsprefix/insidecontainer

If the task first moved itself into /unsprefix, then did unshare(),
when the task examines its own /proc/self/cgroup file it will see /,
but to the host it is really in /unsprefix. Then if it further enters
/insidecontainer here, the full host path will be
/unsprefix/insidecontianer. There is no way to say "set the cgroup
namespace boundary at /unsprefix" without first entering that, doing
the unshare, and then entering the rest of the path.

> > +				pr_perror("couldn't set cgns prefix %s", aux);
> > +				return -1;
> > +			}
> > +
> > +			if (unshare(CLONE_NEWCGROUP) < 0) {
> > +				pr_perror("couldn't unshare cgns");
> > +				return -1;
> > +			}
> > +		}
> > +
> > +		/* Note that unshare(CLONE_NEWCGROUP) doesn't change the view
> > +		 * of previously mounted cgroupfses; since we're restoring via
> > +		 * a dirfd pointing to the cg yard set up by when criu was in
> > +		 * the root cgns, we still want to use the full path here when
> > +		 * we move into the cgroup.
> > +		 */
> >  		snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path);
> >  		pr_debug("  `-> %s\n", aux);
> >  		err = userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1);
> 
> > @@ -727,8 +728,23 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
> >  	if (ret)
> >  		goto err;
> >  
> > +	/* If this is the root task and it has a cgroup ns id, it could be in
> > +	 * a cgroup namespace and we should try to figure out the prefix. Or,
> > +	 * if the task is not the parent task and its cgroup namespace differs
> > +	 * from its parent's, this is a nested cgns and we should compute the
> > +	 * prefix.
> > +	 */
> > +	if ((!item->parent && item->ids->has_cgroup_ns_id) ||
> 
> The has_cgroup_ns_id is always true on dump.

Only if /proc/self/ns/cgroup exists, on older kernels it will be false
and so we don't need to bother with checking the task's
/proc/self/cgroup from inside.

> > +			(item->parent->ids->has_cgroup_ns_id &&
> > +			 item->ids->cgroup_ns_id != item->parent->ids->cgroup_ns_id)) {
> > +		info = &cgroup_args;
> > +		ret = parasite_dump_cgroup(ctl, &cgroup_args);
> > +		if (ret)
> > +			goto err;
> > +	}
> > +
> >  	core->tc->has_cg_set = true;
> > -	ret = dump_task_cgroup(item, &core->tc->cg_set);
> > +	ret = dump_task_cgroup(item, &core->tc->cg_set, info);
> >  	if (ret)
> >  		goto err;
> >  
> 
> > @@ -245,6 +246,16 @@ struct parasite_tty_args {
> >  	int	st_excl;
> >  };
> >  
> > +struct parasite_dump_cgroup_args {
> > +	/* We choose PAGE_SIZE here since that's how big parasite messages are,
> > +	 * although this is probably longer than any /proc/pid/cgroup file will
> > +	 * ever be on most systems (4k).
> > +	 *
> > +	 * The string is null terminated.
> > +	 */
> > +	char contents[PAGE_SIZE];
> 
> This would contain the path to cgroup as task sees it. Why not keep here the
> path's lenght, the full path would anyway be found out when dumping cgroups
> themselves?

I can do that too, mostly I just didn't want to move the cgroup
parsing code to pie :). This will take some goofy memory mapping
stuff, since we need to have a map from controller => path length, but
I think that's ok.

> > +};
> > +
> >  /* the parasite prefix is added by gen_offsets.sh */
> >  #define parasite_sym(pblob, name) ((void *)(pblob) + parasite_blob_offset__##name)
> >  
> 
> > @@ -503,6 +511,41 @@ static inline int parasite_check_vdso_mark(struct parasite_vdso_vma_entry *args)
> >  }
> >  #endif
> >  
> > +static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args)
> > +{
> > +	int proc, cgroup, len;
> > +
> > +	proc = get_proc_fd();
> 
> Isn't it too heavyweight (mkdir + mount + open + umount) for just opening the /proc?

How else will you do it? You need to be sure that the /proc you're
looking at is the right one for the task.

> > +	if (proc < 0) {
> > +		pr_err("can't get /proc fd\n");
> > +		return -1;
> > +	}
> > +
> > +	cgroup = sys_openat(proc, "self/cgroup", O_RDONLY, 0);
> > +	sys_close(proc);
> > +	if (cgroup < 0) {
> > +		pr_err("can't get /proc/self/cgroup fd\n");
> > +		sys_close(cgroup);
> > +		return -1;
> > +	}
> > +
> > +	len = sys_read(cgroup, args->contents, sizeof(args->contents));
> > +	sys_close(cgroup);
> > +	if (len < 0) {
> > +		pr_err("can't read /proc/self/cgroup %d\n", len);
> > +		return -1;
> > +	}
> > +
> > +	if (len == sizeof(*args)) {
> > +		pr_warn("/proc/self/cgroup was bigger than the page size\n");
> > +		return -1;
> > +	}
> > +
> > +	/* null terminate */
> > +	args->contents[len] = 0;
> > +	return 0;
> > +}
> > +
> >  static int __parasite_daemon_reply_ack(unsigned int cmd, int err)
> >  {
> >  	struct ctl_msg m;
> 
> > @@ -2158,13 +2158,8 @@ int parse_threads(int pid, struct pid **_t, int *_n)
> >  	return 0;
> >  }
> >  
> > -int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
> > +int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n)
> 
> This one becomes static, doesn't it?

Yes, good catch.

> >  {
> > -	FILE *f;
> > -
> > -	f = fopen_proc(pid, "cgroup");
> > -	if (f == NULL)
> > -		return -1;
> >  	while (fgets(buf, BUF_SIZE, f)) {
> >  		struct cg_ctl *ncc, *cc;
> >  		char *name, *path = NULL, *e;
> > @@ -2195,6 +2190,7 @@ int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
> >  
> >  		ncc->name = xstrdup(name);
> >  		ncc->path = xstrdup(path);
> > +		ncc->cgns_prefix = NULL;
> >  		if (!ncc->name || !ncc->path) {
> >  			xfree(ncc->name);
> >  			xfree(ncc->path);
> > @@ -2210,20 +2206,97 @@ int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
> >  		(*n)++;
> >  	}
> >  
> > -	fclose(f);
> >  	return 0;
> >  
> >  err:
> >  	put_ctls(retl);
> > -	fclose(f);
> >  	return -1;
> >  }
> >  
> > +int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n)
> > +{
> > +	FILE *f;
> > +	int ret;
> > +	LIST_HEAD(internal);
> > +	unsigned int n_internal;
> > +	struct cg_ctl *intern, *ext;
> > +
> > +	f = fopen_proc(pid, "cgroup");
> > +	if (!f) {
> > +		pr_perror("couldn't open task cgroup file");
> > +		return -1;
> > +	}
> > +
> > +	ret = parse_cgroup_file(f, retl, n);
> > +	fclose(f);
> > +	if (ret < 0)
> > +		return -1;
> > +
> > +	/* No parasite args, we're dumping criu's cg set, so we don't need to
> > +	 * try and parse the "internal" cgroup set to find namespace
> > +	 * boundaries.
> > +	 */
> > +	if (!args)
> > +		return 0;
> > +
> > +	f = fmemopen(args->contents, strlen(args->contents), "r");
> > +	if (!f) {
> > +		pr_perror("couldn't fmemopen cgroup buffer:\n%s\n", args->contents);
> > +		return -1;
> > +	}
> > +
> > +	ret = parse_cgroup_file(f, &internal, &n_internal);
> > +	fclose(f);
> > +	if (ret < 0) {
> > +		pr_err("couldn't parse internal cgroup file\n");
> > +		return -1;
> > +	}
> > +
> > +	list_for_each_entry(intern, &internal, l) {
> > +		list_for_each_entry(ext, retl, l) {
> > +			char *pos, tmp;
> > +
> > +			if (strcmp(ext->name, intern->name))
> > +				continue;
> > +
> > +			/* If the cgroup namespace was unshared at / (or there
> > +			 * is no cgroup namespace relative to criu), the paths
> > +			 * are equal and we don't need to set a prefix.
> > +			 */
> > +			if (!strcmp(ext->path, intern->path))
> > +				continue;
> > +
> > +			/* +1 here to chop off the leading / */
> > +			pos = ext->path + strlen(ext->path) - strlen(intern->path+1);
> > +			if (strcmp(pos, intern->path+1)) {
> > +				pr_err("invalid cgroup configuration, %s is not a suffix of %s\n", intern->path, ext->path);
> > +				ret = -1;
> > +				goto out;
> > +			}
> > +
> > +			tmp = *pos;
> > +			*pos = '\0';
> > +			ext->cgns_prefix = xstrdup(ext->path);
> > +			if (!ext->cgns_prefix) {
> > +				ret = -1;
> > +				goto out;
> > +			}
> > +			*pos = tmp;
> > +		}
> > +	}
> 
> Erm... I'm not sure I fully understand what's going on in this routine.
> Would you comment on it please?

Sure, I can add some comments. Something like:

/* Here's where we actually compute the cgns prefix. Consider a task
 * in /foo/bar which has unshared its namespace at /foo. The internal
 * path is /bar, but the external path is /foo/bar, and the cgns
 * prefix is /foo. The algorithm is:
 *
 * // no cg ns unshare in this case
 * if (internal == external)
 *   continue;
 * idx = find_suffix_pos(external, internal)
 * cgns_prefix = external[:idx]
 */

Does that make sense?

> > +
> > +out:
> > +	put_ctls(&internal);
> > +	return ret;
> > +}
> > +
> >  void put_ctls(struct list_head *l)
> >  {
> >  	struct cg_ctl *c, *n;
> >  
> >  	list_for_each_entry_safe(c, n, l, l) {
> > +		if (c->cgns_prefix)
> > +			xfree(c->cgns_prefix);
> >  		xfree(c->name);
> >  		xfree(c->path);
> >  		xfree(c);
> > diff --git a/images/cgroup.proto b/images/cgroup.proto
> > index dcd2fe8..f255c8c 100644
> > --- a/images/cgroup.proto
> > +++ b/images/cgroup.proto
> > @@ -23,8 +23,9 @@ message cg_controller_entry {
> >  }
> >  
> >  message cg_member_entry {
> > -	required string name	= 1;
> > -	required string path	= 2;
> > +	required string name		= 1;
> > +	required string path		= 2;
> > +	optional string cgns_prefix	= 3;
> 
> Same as for parasite args -- maybe it's more efficient to to save prefix lenght as int here?

Yep, I can make that change.

Thanks for looking!

Tycho