[CRIU] [PATCH 2/7] cgroup: add support for cgroup namespaces
Tycho Andersen
tycho.andersen at canonical.com
Fri Feb 19 08:02:57 PST 2016
On Thu, Feb 18, 2016 at 11:40:44PM +0300, Pavel Emelyanov wrote:
>
> > @@ -997,7 +1004,7 @@ static int userns_move(void *arg, int fd, pid_t pid)
> > return 0;
> > }
> >
> > -static int move_in_cgroup(CgSetEntry *se)
> > +static int move_in_cgroup(CgSetEntry *se, bool do_cgns_set)
>
> If I'm not mistaken the do_cgns_set is always true.
Ah, perhaps this is left over from a previous version, I'll drop it.
> > {
> > int i;
> >
> > @@ -1023,6 +1030,26 @@ static int move_in_cgroup(CgSetEntry *se)
> >
> > aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0);
> >
> > + if (do_cgns_set && ce->cgns_prefix) {
> > + pr_info("setting cgns prefix to %s\n", ce->cgns_prefix);
> > + snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->cgns_prefix);
> > + if (userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1) < 0) {
>
> Why do we need to do move here? The 2nd would occur several lines below anyway.
Consider a task in a cgroup (according to the host):
/unsprefix/insidecontainer
If the task first moved itself into /unsprefix, then did unshare(),
when the task examines its own /proc/self/cgroup file it will see /,
but to the host it is really in /unsprefix. Then if it further enters
/insidecontainer here, the full host path will be
/unsprefix/insidecontianer. There is no way to say "set the cgroup
namespace boundary at /unsprefix" without first entering that, doing
the unshare, and then entering the rest of the path.
> > + pr_perror("couldn't set cgns prefix %s", aux);
> > + return -1;
> > + }
> > +
> > + if (unshare(CLONE_NEWCGROUP) < 0) {
> > + pr_perror("couldn't unshare cgns");
> > + return -1;
> > + }
> > + }
> > +
> > + /* Note that unshare(CLONE_NEWCGROUP) doesn't change the view
> > + * of previously mounted cgroupfses; since we're restoring via
> > + * a dirfd pointing to the cg yard set up by when criu was in
> > + * the root cgns, we still want to use the full path here when
> > + * we move into the cgroup.
> > + */
> > snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path);
> > pr_debug(" `-> %s\n", aux);
> > err = userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1);
>
> > @@ -727,8 +728,23 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
> > if (ret)
> > goto err;
> >
> > + /* If this is the root task and it has a cgroup ns id, it could be in
> > + * a cgroup namespace and we should try to figure out the prefix. Or,
> > + * if the task is not the parent task and its cgroup namespace differs
> > + * from its parent's, this is a nested cgns and we should compute the
> > + * prefix.
> > + */
> > + if ((!item->parent && item->ids->has_cgroup_ns_id) ||
>
> The has_cgroup_ns_id is always true on dump.
Only if /proc/self/ns/cgroup exists, on older kernels it will be false
and so we don't need to bother with checking the task's
/proc/self/cgroup from inside.
> > + (item->parent->ids->has_cgroup_ns_id &&
> > + item->ids->cgroup_ns_id != item->parent->ids->cgroup_ns_id)) {
> > + info = &cgroup_args;
> > + ret = parasite_dump_cgroup(ctl, &cgroup_args);
> > + if (ret)
> > + goto err;
> > + }
> > +
> > core->tc->has_cg_set = true;
> > - ret = dump_task_cgroup(item, &core->tc->cg_set);
> > + ret = dump_task_cgroup(item, &core->tc->cg_set, info);
> > if (ret)
> > goto err;
> >
>
> > @@ -245,6 +246,16 @@ struct parasite_tty_args {
> > int st_excl;
> > };
> >
> > +struct parasite_dump_cgroup_args {
> > + /* We choose PAGE_SIZE here since that's how big parasite messages are,
> > + * although this is probably longer than any /proc/pid/cgroup file will
> > + * ever be on most systems (4k).
> > + *
> > + * The string is null terminated.
> > + */
> > + char contents[PAGE_SIZE];
>
> This would contain the path to cgroup as task sees it. Why not keep here the
> path's lenght, the full path would anyway be found out when dumping cgroups
> themselves?
I can do that too, mostly I just didn't want to move the cgroup
parsing code to pie :). This will take some goofy memory mapping
stuff, since we need to have a map from controller => path length, but
I think that's ok.
> > +};
> > +
> > /* the parasite prefix is added by gen_offsets.sh */
> > #define parasite_sym(pblob, name) ((void *)(pblob) + parasite_blob_offset__##name)
> >
>
> > @@ -503,6 +511,41 @@ static inline int parasite_check_vdso_mark(struct parasite_vdso_vma_entry *args)
> > }
> > #endif
> >
> > +static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args)
> > +{
> > + int proc, cgroup, len;
> > +
> > + proc = get_proc_fd();
>
> Isn't it too heavyweight (mkdir + mount + open + umount) for just opening the /proc?
How else will you do it? You need to be sure that the /proc you're
looking at is the right one for the task.
> > + if (proc < 0) {
> > + pr_err("can't get /proc fd\n");
> > + return -1;
> > + }
> > +
> > + cgroup = sys_openat(proc, "self/cgroup", O_RDONLY, 0);
> > + sys_close(proc);
> > + if (cgroup < 0) {
> > + pr_err("can't get /proc/self/cgroup fd\n");
> > + sys_close(cgroup);
> > + return -1;
> > + }
> > +
> > + len = sys_read(cgroup, args->contents, sizeof(args->contents));
> > + sys_close(cgroup);
> > + if (len < 0) {
> > + pr_err("can't read /proc/self/cgroup %d\n", len);
> > + return -1;
> > + }
> > +
> > + if (len == sizeof(*args)) {
> > + pr_warn("/proc/self/cgroup was bigger than the page size\n");
> > + return -1;
> > + }
> > +
> > + /* null terminate */
> > + args->contents[len] = 0;
> > + return 0;
> > +}
> > +
> > static int __parasite_daemon_reply_ack(unsigned int cmd, int err)
> > {
> > struct ctl_msg m;
>
> > @@ -2158,13 +2158,8 @@ int parse_threads(int pid, struct pid **_t, int *_n)
> > return 0;
> > }
> >
> > -int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
> > +int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n)
>
> This one becomes static, doesn't it?
Yes, good catch.
> > {
> > - FILE *f;
> > -
> > - f = fopen_proc(pid, "cgroup");
> > - if (f == NULL)
> > - return -1;
> > while (fgets(buf, BUF_SIZE, f)) {
> > struct cg_ctl *ncc, *cc;
> > char *name, *path = NULL, *e;
> > @@ -2195,6 +2190,7 @@ int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
> >
> > ncc->name = xstrdup(name);
> > ncc->path = xstrdup(path);
> > + ncc->cgns_prefix = NULL;
> > if (!ncc->name || !ncc->path) {
> > xfree(ncc->name);
> > xfree(ncc->path);
> > @@ -2210,20 +2206,97 @@ int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
> > (*n)++;
> > }
> >
> > - fclose(f);
> > return 0;
> >
> > err:
> > put_ctls(retl);
> > - fclose(f);
> > return -1;
> > }
> >
> > +int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n)
> > +{
> > + FILE *f;
> > + int ret;
> > + LIST_HEAD(internal);
> > + unsigned int n_internal;
> > + struct cg_ctl *intern, *ext;
> > +
> > + f = fopen_proc(pid, "cgroup");
> > + if (!f) {
> > + pr_perror("couldn't open task cgroup file");
> > + return -1;
> > + }
> > +
> > + ret = parse_cgroup_file(f, retl, n);
> > + fclose(f);
> > + if (ret < 0)
> > + return -1;
> > +
> > + /* No parasite args, we're dumping criu's cg set, so we don't need to
> > + * try and parse the "internal" cgroup set to find namespace
> > + * boundaries.
> > + */
> > + if (!args)
> > + return 0;
> > +
> > + f = fmemopen(args->contents, strlen(args->contents), "r");
> > + if (!f) {
> > + pr_perror("couldn't fmemopen cgroup buffer:\n%s\n", args->contents);
> > + return -1;
> > + }
> > +
> > + ret = parse_cgroup_file(f, &internal, &n_internal);
> > + fclose(f);
> > + if (ret < 0) {
> > + pr_err("couldn't parse internal cgroup file\n");
> > + return -1;
> > + }
> > +
> > + list_for_each_entry(intern, &internal, l) {
> > + list_for_each_entry(ext, retl, l) {
> > + char *pos, tmp;
> > +
> > + if (strcmp(ext->name, intern->name))
> > + continue;
> > +
> > + /* If the cgroup namespace was unshared at / (or there
> > + * is no cgroup namespace relative to criu), the paths
> > + * are equal and we don't need to set a prefix.
> > + */
> > + if (!strcmp(ext->path, intern->path))
> > + continue;
> > +
> > + /* +1 here to chop off the leading / */
> > + pos = ext->path + strlen(ext->path) - strlen(intern->path+1);
> > + if (strcmp(pos, intern->path+1)) {
> > + pr_err("invalid cgroup configuration, %s is not a suffix of %s\n", intern->path, ext->path);
> > + ret = -1;
> > + goto out;
> > + }
> > +
> > + tmp = *pos;
> > + *pos = '\0';
> > + ext->cgns_prefix = xstrdup(ext->path);
> > + if (!ext->cgns_prefix) {
> > + ret = -1;
> > + goto out;
> > + }
> > + *pos = tmp;
> > + }
> > + }
>
> Erm... I'm not sure I fully understand what's going on in this routine.
> Would you comment on it please?
Sure, I can add some comments. Something like:
/* Here's where we actually compute the cgns prefix. Consider a task
* in /foo/bar which has unshared its namespace at /foo. The internal
* path is /bar, but the external path is /foo/bar, and the cgns
* prefix is /foo. The algorithm is:
*
* // no cg ns unshare in this case
* if (internal == external)
* continue;
* idx = find_suffix_pos(external, internal)
* cgns_prefix = external[:idx]
*/
Does that make sense?
> > +
> > +out:
> > + put_ctls(&internal);
> > + return ret;
> > +}
> > +
> > void put_ctls(struct list_head *l)
> > {
> > struct cg_ctl *c, *n;
> >
> > list_for_each_entry_safe(c, n, l, l) {
> > + if (c->cgns_prefix)
> > + xfree(c->cgns_prefix);
> > xfree(c->name);
> > xfree(c->path);
> > xfree(c);
> > diff --git a/images/cgroup.proto b/images/cgroup.proto
> > index dcd2fe8..f255c8c 100644
> > --- a/images/cgroup.proto
> > +++ b/images/cgroup.proto
> > @@ -23,8 +23,9 @@ message cg_controller_entry {
> > }
> >
> > message cg_member_entry {
> > - required string name = 1;
> > - required string path = 2;
> > + required string name = 1;
> > + required string path = 2;
> > + optional string cgns_prefix = 3;
>
> Same as for parasite args -- maybe it's more efficient to to save prefix lenght as int here?
Yep, I can make that change.
Thanks for looking!
Tycho
More information about the CRIU
mailing list