[CRIU] [PATCH v2 2/5] cgroup: add support for cgroup namespaces

Tycho Andersen tycho.andersen at canonical.com
Sun Feb 21 11:28:50 PST 2016


Cgroup namespaces are about to be merged into the kernel (indeed, they went
into and then back out of 4.5 over minor issues), and will be carried as a
patchset in the Ubuntu 16.04 kernel. Here's an attempt at c/r.

There are essentially three key steps:
  * on dump, in parse_task_cgroup, we should ask the task what cgroups it
    thinks it is in (unless it has the same cgroup ns id as its parent, then we
    should just take the prefixes from the parent's set), and set the prefix on
    the cg set
  * add a new restore step, prepare_cgroup_namespace(), which happens in
    prepare_task_cgroup() that does an unshare() if necessary
  * when restoring, in move_in_cgroup, if we're going to restore via usernsd,
    leave the full path. If not, use (cgset->path + len(cgset->cgns_prefix)) as
    the path, since we will have already moved into the cgns_prefix and unshared.

Another observation here is that we can support nesting, since these are
restored hierarchically by nature.

v2: * store cgns prefix length instead of full prefix in images
    * set has_cgroup_ns_id conditionally
    * drop unused argument to move_in_cgroup
    * add extra comments about what is happening when calling unshare() on
      restore
    * add extra comments about what is happening when computing the actual
      cgns prefix

Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
 criu/cgroup.c                   | 56 +++++++++++++++++++++++--
 criu/cr-dump.c                  | 17 +++++++-
 criu/image.c                    |  2 +-
 criu/include/cgroup.h           | 10 ++++-
 criu/include/namespaces.h       |  8 +++-
 criu/include/parasite-syscall.h |  3 ++
 criu/include/parasite.h         | 11 +++++
 criu/include/proc_parse.h       |  3 +-
 criu/include/syscall-types.h    |  6 ++-
 criu/namespaces.c               | 41 ++++++++++++++-----
 criu/parasite-syscall.c         | 16 ++++++++
 criu/pie/parasite.c             | 62 ++++++++++++++++++++++++----
 criu/proc_parse.c               | 91 +++++++++++++++++++++++++++++++++++++----
 images/cgroup.proto             |  5 ++-
 images/core.proto               |  1 +
 15 files changed, 294 insertions(+), 38 deletions(-)

diff --git a/criu/cgroup.c b/criu/cgroup.c
index 735f749..cf474f9 100644
--- a/criu/cgroup.c
+++ b/criu/cgroup.c
@@ -18,6 +18,7 @@
 #include "util-pie.h"
 #include "namespaces.h"
 #include "seize.h"
+#include "syscall-types.h"
 #include "protobuf.h"
 #include "images/core.pb-c.h"
 #include "images/cgroup.pb-c.h"
@@ -150,6 +151,10 @@ static bool cg_set_compare(struct cg_set *set, struct list_head *ctls, int what)
 
 		switch (what) {
 		case CGCMP_MATCH:
+			/* must have the same cgns prefix to be considered equal */
+			if (c1->cgns_prefix != c2->cgns_prefix)
+				return false;
+
 			if (strcmp(c1->path, c2->path))
 				return false;
 
@@ -191,7 +196,7 @@ static struct cg_set *get_cg_set(struct list_head *ctls, unsigned int n_ctls)
 			struct cg_ctl *ctl;
 
 			list_for_each_entry(ctl, &cs->ctls, l)
-				pr_debug("    `- [%s] -> [%s]\n", ctl->name, ctl->path);
+				pr_debug("    `- [%s] -> [%s] [%u]\n", ctl->name, ctl->path, ctl->cgns_prefix);
 		}
 	}
 
@@ -658,7 +663,7 @@ static int collect_cgroups(struct list_head *ctls)
 	return 0;
 }
 
-int dump_task_cgroup(struct pstree_item *item, u32 *cg_id)
+int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args)
 {
 	int pid;
 	LIST_HEAD(ctls);
@@ -671,7 +676,7 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id)
 		pid = getpid();
 
 	pr_info("Dumping cgroups for %d\n", pid);
-	if (parse_task_cgroup(pid, &ctls, &n_ctls))
+	if (parse_task_cgroup(pid, args, &ctls, &n_ctls))
 		return -1;
 
 	cs = get_cg_set(&ctls, n_ctls);
@@ -889,6 +894,10 @@ static int dump_sets(CgroupEntry *cg)
 			cg_member_entry__init(ce);
 			ce->name = ctl->name;
 			ce->path = ctl->path;
+			if (ctl->cgns_prefix > 0) {
+				ce->has_cgns_prefix = true;
+				ce->cgns_prefix = ctl->cgns_prefix;
+			}
 			se->ctls[c++] = ce++;
 		}
 
@@ -1021,6 +1030,45 @@ static int move_in_cgroup(CgSetEntry *se)
 
 		aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0);
 
+		/* We need to do an unshare() here as unshare() pins the root
+		 * of the cgroup namespace to whatever the current cgroups are.
+		 * For example, consider a task in a cgroup (according to the
+		 * host):
+		 *
+		 * /unsprefix/insidecontainer
+		 *
+		 * If the task first moved itself into /unsprefix, then did unshare(),
+		 * when the task examines its own /proc/self/cgroup file it will see /,
+		 * but to the host it is really in /unsprefix. Then if it further enters
+		 * /insidecontainer here, the full host path will be
+		 * /unsprefix/insidecontainer. There is no way to say "set the cgroup
+		 * namespace boundary at /unsprefix" without first entering that, doing
+		 * the unshare, and then entering the rest of the path.
+		 */
+		if (ce->has_cgns_prefix) {
+			char tmp = ce->path[ce->cgns_prefix];
+			ce->path[ce->cgns_prefix] = '\0';
+
+			pr_info("setting cgns prefix to %s\n", ce->path);
+			snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path);
+			ce->path[ce->cgns_prefix] = tmp;
+			if (userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1) < 0) {
+				pr_perror("couldn't set cgns prefix %s", aux);
+				return -1;
+			}
+
+			if (unshare(CLONE_NEWCGROUP) < 0) {
+				pr_perror("couldn't unshare cgns");
+				return -1;
+			}
+		}
+
+		/* Note that unshare(CLONE_NEWCGROUP) doesn't change the view
+		 * of previously mounted cgroupfses; since we're restoring via
+		 * a dirfd pointing to the cg yard set up when criu was in
+		 * the root cgns, we still want to use the full path here when
+		 * we move into the cgroup.
+		 */
 		snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path);
 		pr_debug("  `-> %s\n", aux);
 		err = userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1);
@@ -1567,3 +1615,5 @@ int new_cg_root_add(char *controller, char *newroot)
 	list_add(&o->node, &opts.new_cgroup_roots);
 	return 0;
 }
+
+struct ns_desc cgroup_ns_desc = NS_DESC_ENTRY(CLONE_NEWCGROUP, "cgroup");
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 27cc1d6..baf4548 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -689,6 +689,7 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
 	pid_t pid = item->pid.real;
 	int ret = -1;
 	struct proc_status_creds *creds;
+	struct parasite_dump_cgroup_args cgroup_args, *info = NULL;
 
 	pr_info("\n");
 	pr_info("Dumping core (pid: %d)\n", pid);
@@ -727,8 +728,22 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
 	if (ret)
 		goto err;
 
+	/* If this is the root task and it has a cgroup ns id, it could be in
+	 * a cgroup namespace and we should try to figure out the prefix. Or,
+	 * if the task is not the parent task and its cgroup namespace differs
+	 * from its parent's, this is a nested cgns and we should compute the
+	 * prefix.
+	 */
+	if (item->ids->has_cgroup_ns_id && (!item->parent ||
+			(item->ids->cgroup_ns_id != item->parent->ids->cgroup_ns_id))) {
+		info = &cgroup_args;
+		ret = parasite_dump_cgroup(ctl, &cgroup_args);
+		if (ret)
+			goto err;
+	}
+
 	core->tc->has_cg_set = true;
-	ret = dump_task_cgroup(item, &core->tc->cg_set);
+	ret = dump_task_cgroup(item, &core->tc->cg_set, info);
 	if (ret)
 		goto err;
 
diff --git a/criu/image.c b/criu/image.c
index 83e62d3..4349210 100644
--- a/criu/image.c
+++ b/criu/image.c
@@ -117,7 +117,7 @@ int prepare_inventory(InventoryEntry *he)
 		return -1;
 
 	he->has_root_cg_set = true;
-	if (dump_task_cgroup(NULL, &he->root_cg_set))
+	if (dump_task_cgroup(NULL, &he->root_cg_set, NULL))
 		return -1;
 
 	he->root_ids = crt.i.ids;
diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h
index 393ee3d..c162bb2 100644
--- a/criu/include/cgroup.h
+++ b/criu/include/cgroup.h
@@ -1,9 +1,15 @@
 #ifndef __CR_CGROUP_H__
 #define __CR_CGROUP_H__
+
 #include "asm/int.h"
+
+#include "parasite.h"
+
+#include "images/core.pb-c.h"
+
 struct pstree_item;
 extern u32 root_cg_set;
-int dump_task_cgroup(struct pstree_item *, u32 *);
+int dump_task_cgroup(struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args);
 int dump_cgroups(void);
 int prepare_task_cgroup(struct pstree_item *);
 int prepare_cgroup(void);
@@ -62,4 +68,6 @@ struct cg_controller *new_controller(const char *name);
 /* parse all global cgroup information into structures */
 int parse_cg_info(void);
 int new_cg_root_add(char *controller, char *newroot);
+
+extern struct ns_desc cgroup_ns_desc;
 #endif /* __CR_CGROUP_H__ */
diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
index eba0fac..18c1fd9 100644
--- a/criu/include/namespaces.h
+++ b/criu/include/namespaces.h
@@ -4,8 +4,14 @@
 #include "compiler.h"
 #include "files.h"
 
+/* including syscall-types.h gives another weird error; do we really need to
+ * define this twice? */
+#ifndef CLONE_NEWCGROUP
+#define CLONE_NEWCGROUP	0x02000000
+#endif
+
 /* Nested namespaces are supported only for these types */
-#define CLONE_SUBNS	(CLONE_NEWNS)
+#define CLONE_SUBNS	(CLONE_NEWNS | CLONE_NEWCGROUP)
 
 struct ns_desc {
 	unsigned int	cflag;
diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h
index 57612df..5ed8e35 100644
--- a/criu/include/parasite-syscall.h
+++ b/criu/include/parasite-syscall.h
@@ -19,6 +19,7 @@ struct list_head;
 struct cr_imgset;
 struct fd_opts;
 struct pid;
+struct parasite_dump_cgroup_args;
 
 struct thread_ctx {
 	k_rtsigset_t		sigmask;
@@ -105,6 +106,8 @@ extern struct parasite_ctl *parasite_prep_ctl(pid_t pid,
 					      struct vm_area_list *vma_area_list);
 extern int parasite_map_exchange(struct parasite_ctl *ctl, unsigned long size);
 
+extern int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_cgroup_args *cgroup);
+
 extern struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type);
 
 extern int parasite_init_threads_seized(struct parasite_ctl *ctl, struct pstree_item *item);
diff --git a/criu/include/parasite.h b/criu/include/parasite.h
index d0afe17..e3383d8 100644
--- a/criu/include/parasite.h
+++ b/criu/include/parasite.h
@@ -48,6 +48,7 @@ enum {
 	PARASITE_CMD_DUMP_TTY,
 	PARASITE_CMD_CHECK_VDSO_MARK,
 	PARASITE_CMD_CHECK_AIOS,
+	PARASITE_CMD_DUMP_CGROUP,
 
 	PARASITE_CMD_MAX,
 };
@@ -245,6 +246,16 @@ struct parasite_tty_args {
 	int	st_excl;
 };
 
+struct parasite_dump_cgroup_args {
+	/* Raw text of the task's own /proc/self/cgroup, filled in by the parasite.
+	 * We choose PAGE_SIZE here since that's how big parasite messages are,
+	 * although this is probably longer than any /proc/pid/cgroup file will
+	 * ever be on most systems (4k).
+	 * The string is NUL terminated.
+	 */
+	char contents[PAGE_SIZE];
+};
+
 /* the parasite prefix is added by gen_offsets.sh */
 #define parasite_sym(pblob, name) ((void *)(pblob) + parasite_blob_offset__##name)
 
diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h
index 04fecd9..ae8e713 100644
--- a/criu/include/proc_parse.h
+++ b/criu/include/proc_parse.h
@@ -195,13 +195,14 @@ struct cg_ctl {
 	struct list_head l;
 	char *name;
 	char *path;
+	u32 cgns_prefix;
 };
 
 /*
  * Returns the list of cg_ctl-s sorted by name
  */
 
-extern int parse_task_cgroup(int pid, struct list_head *l, unsigned int *n);
+extern int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *l, unsigned int *n);
 extern void put_ctls(struct list_head *);
 
 int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups);
diff --git a/criu/include/syscall-types.h b/criu/include/syscall-types.h
index e3a114d..b056f6d 100644
--- a/criu/include/syscall-types.h
+++ b/criu/include/syscall-types.h
@@ -65,7 +65,11 @@ struct itimerspec;
 #define CLONE_NEWUSER	0x10000000
 #endif
 
-#define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER)
+#ifndef CLONE_NEWCGROUP
+#define CLONE_NEWCGROUP	0x02000000
+#endif
+
+#define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP)
 
 #define setns	sys_setns
 
diff --git a/criu/namespaces.c b/criu/namespaces.c
index bb5b622..d896313 100644
--- a/criu/namespaces.c
+++ b/criu/namespaces.c
@@ -19,6 +19,7 @@
 #include "pstree.h"
 #include "namespaces.h"
 #include "net.h"
+#include "cgroup.h"
 
 #include "protobuf.h"
 #include "images/ns.pb-c.h"
@@ -31,6 +32,7 @@ static struct ns_desc *ns_desc_array[] = {
 	&pid_ns_desc,
 	&user_ns_desc,
 	&mnt_ns_desc,
+	&cgroup_ns_desc,
 };
 
 static unsigned int parse_ns_link(char *link, size_t len, struct ns_desc *d)
@@ -272,7 +274,7 @@ found:
 	return nsid->id;
 }
 
-static unsigned int __get_ns_id(int pid, struct ns_desc *nd, struct ns_id **ns)
+static unsigned int __get_ns_id(int pid, struct ns_desc *nd, protobuf_c_boolean *supported, struct ns_id **ns)
 {
 	int proc_dir, ret;
 	unsigned int kid;
@@ -299,12 +301,14 @@ static unsigned int __get_ns_id(int pid, struct ns_desc *nd, struct ns_id **ns)
 	BUG_ON(!kid);
 
 out:
+	if (supported)
+		*supported = kid != 0;
 	return generate_ns_id(pid, kid, nd, ns);
 }
 
-static unsigned int get_ns_id(int pid, struct ns_desc *nd)
+static unsigned int get_ns_id(int pid, struct ns_desc *nd, protobuf_c_boolean *supported)
 {
-	return __get_ns_id(pid, nd, NULL);
+	return __get_ns_id(pid, nd, supported, NULL);
 }
 
 int dump_one_ns_file(int lfd, u32 id, const struct fd_parms *p)
@@ -374,6 +378,10 @@ static int open_ns_fd(struct file_desc *d)
 			item = t;
 			nd = &mnt_ns_desc;
 			break;
+		} else if (ids->cgroup_ns_id == nfi->nfe->ns_id) {
+			item = t;
+			nd = &cgroup_ns_desc;
+			break;
 		}
 	}
 
@@ -433,10 +441,10 @@ int predump_task_ns_ids(struct pstree_item *item)
 {
 	int pid = item->pid.real;
 
-	if (!__get_ns_id(pid, &net_ns_desc, &dmpi(item)->netns))
+	if (!__get_ns_id(pid, &net_ns_desc, NULL, &dmpi(item)->netns))
 		return -1;
 
-	if (!get_ns_id(pid, &mnt_ns_desc))
+	if (!get_ns_id(pid, &mnt_ns_desc, NULL))
 		return -1;
 
 	return 0;
@@ -448,47 +456,53 @@ int dump_task_ns_ids(struct pstree_item *item)
 	TaskKobjIdsEntry *ids = item->ids;
 
 	ids->has_pid_ns_id = true;
-	ids->pid_ns_id = get_ns_id(pid, &pid_ns_desc);
+	ids->pid_ns_id = get_ns_id(pid, &pid_ns_desc, NULL);
 	if (!ids->pid_ns_id) {
 		pr_err("Can't make pidns id\n");
 		return -1;
 	}
 
 	ids->has_net_ns_id = true;
-	ids->net_ns_id = __get_ns_id(pid, &net_ns_desc, &dmpi(item)->netns);
+	ids->net_ns_id = __get_ns_id(pid, &net_ns_desc, NULL, &dmpi(item)->netns);
 	if (!ids->net_ns_id) {
 		pr_err("Can't make netns id\n");
 		return -1;
 	}
 
 	ids->has_ipc_ns_id = true;
-	ids->ipc_ns_id = get_ns_id(pid, &ipc_ns_desc);
+	ids->ipc_ns_id = get_ns_id(pid, &ipc_ns_desc, NULL);
 	if (!ids->ipc_ns_id) {
 		pr_err("Can't make ipcns id\n");
 		return -1;
 	}
 
 	ids->has_uts_ns_id = true;
-	ids->uts_ns_id = get_ns_id(pid, &uts_ns_desc);
+	ids->uts_ns_id = get_ns_id(pid, &uts_ns_desc, NULL);
 	if (!ids->uts_ns_id) {
 		pr_err("Can't make utsns id\n");
 		return -1;
 	}
 
 	ids->has_mnt_ns_id = true;
-	ids->mnt_ns_id = get_ns_id(pid, &mnt_ns_desc);
+	ids->mnt_ns_id = get_ns_id(pid, &mnt_ns_desc, NULL);
 	if (!ids->mnt_ns_id) {
 		pr_err("Can't make mntns id\n");
 		return -1;
 	}
 
 	ids->has_user_ns_id = true;
-	ids->user_ns_id = get_ns_id(pid, &user_ns_desc);
+	ids->user_ns_id = get_ns_id(pid, &user_ns_desc, NULL);
 	if (!ids->user_ns_id) {
 		pr_err("Can't make userns id\n");
 		return -1;
 	}
 
+	ids->cgroup_ns_id = get_ns_id(pid, &cgroup_ns_desc, &ids->has_cgroup_ns_id);
+	if (!ids->cgroup_ns_id) {
+		pr_err("Can't make cgroup id\n");
+		return -1;
+	}
+
 	return 0;
 }
 
@@ -814,6 +828,11 @@ static int do_dump_namespaces(struct ns_id *ns)
 				ns->id, ns->ns_pid);
 		ret = dump_net_ns(ns->id);
 		break;
+	case CLONE_NEWCGROUP:
+		pr_info("Dump CGROUP namespace info %d via %d\n",
+				ns->id, ns->ns_pid);
+		/* handled separately in cgroup dumping code */
+		break;
 	default:
 		pr_err("Unknown namespace flag %x\n", ns->nd->cflag);
 		break;
diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c
index b9f1b8c..2b13da2 100644
--- a/criu/parasite-syscall.c
+++ b/criu/parasite-syscall.c
@@ -1275,6 +1275,22 @@ int parasite_map_exchange(struct parasite_ctl *ctl, unsigned long size)
 	return ret;
 }
 
+/* Have the parasite read its /proc/self/cgroup and copy the text to *cgroup. */
+int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_cgroup_args *cgroup)
+{
+	struct parasite_dump_cgroup_args *reply = parasite_args(ctl, struct parasite_dump_cgroup_args);
+	int err = parasite_execute_daemon(PARASITE_CMD_DUMP_CGROUP, ctl);
+
+	if (err) {
+		pr_err("Parasite failed to dump /proc/self/cgroup\n");
+		return err;
+	}
+
+	/* Copy the reply out by value so the caller holds its own buffer. */
+	*cgroup = *reply;
+	return 0;
+}
+
 static unsigned long parasite_args_size = PARASITE_ARG_SIZE_MIN;
 void parasite_ensure_args_size(unsigned long sz)
 {
diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
index 2466665..3dbd241 100644
--- a/criu/pie/parasite.c
+++ b/criu/pie/parasite.c
@@ -294,9 +294,9 @@ static int dump_thread(struct parasite_dump_thread *args)
 }
 
 static char proc_mountpoint[] = "proc.crtools";
-static int parasite_get_proc_fd()
+static int get_proc_fd(void)
 {
-	int ret, fd = -1;
+	int ret;
 	char buf[2];
 
 	ret = sys_readlinkat(AT_FDCWD, "/proc/self", buf, sizeof(buf));
@@ -307,8 +307,7 @@ static int parasite_get_proc_fd()
 
 	/* Fast path -- if /proc belongs to this pidns */
 	if (ret == 1 && buf[0] == '1') {
-		fd = sys_open("/proc", O_RDONLY, 0);
-		goto out_send_fd;
+		return sys_open("/proc", O_RDONLY, 0);
 	}
 
 	ret = sys_mkdir(proc_mountpoint, 0700);
@@ -324,10 +323,19 @@ static int parasite_get_proc_fd()
 		return -1;
 	}
 
-	fd = open_detach_mount(proc_mountpoint);
-out_send_fd:
-	if (fd < 0)
-		return fd;
+	return open_detach_mount(proc_mountpoint);
+}
+
+static int parasite_get_proc_fd()
+{
+	int fd, ret;
+
+	fd = get_proc_fd();
+	if (fd < 0) {
+		pr_err("Can't get /proc fd\n");
+		return -1;
+	}
+
 	ret = send_fd(tsock, NULL, 0, fd);
 	sys_close(fd);
 	return ret;
@@ -513,6 +521,41 @@ static inline int parasite_check_vdso_mark(struct parasite_vdso_vma_entry *args)
 }
 #endif
 
+/* Read this task's /proc/self/cgroup into args->contents, NUL terminated. */
+static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args)
+{
+	int proc, cgroup, len;
+
+	proc = get_proc_fd();
+	if (proc < 0) {
+		pr_err("can't get /proc fd\n");
+		return -1;
+	}
+
+	cgroup = sys_openat(proc, "self/cgroup", O_RDONLY, 0);
+	sys_close(proc);
+	if (cgroup < 0) {
+		pr_err("can't get /proc/self/cgroup fd\n");
+		return -1;
+	}
+
+	len = sys_read(cgroup, args->contents, sizeof(args->contents));
+	sys_close(cgroup);
+	if (len < 0) {
+		pr_err("can't read /proc/self/cgroup %d\n", len);
+		return -1;
+	}
+
+	if (len == sizeof(args->contents)) {
+		pr_warn("/proc/self/cgroup was bigger than the page size\n");
+		return -1;
+	}
+
+	/* len < sizeof(contents) here, so the terminator fits */
+	args->contents[len] = 0;
+	return 0;
+}
+
 static int __parasite_daemon_reply_ack(unsigned int cmd, int err)
 {
 	struct ctl_msg m;
@@ -643,6 +686,9 @@ static noinline __used int noinline parasite_daemon(void *args)
 		case PARASITE_CMD_CHECK_VDSO_MARK:
 			ret = parasite_check_vdso_mark(args);
 			break;
+		case PARASITE_CMD_DUMP_CGROUP:
+			ret = parasite_dump_cgroup(args);
+			break;
 		default:
 			pr_err("Unknown command in parasite daemon thread leader: %d\n", m.cmd);
 			ret = -1;
diff --git a/criu/proc_parse.c b/criu/proc_parse.c
index 9825da9..e900b6e 100644
--- a/criu/proc_parse.c
+++ b/criu/proc_parse.c
@@ -2153,13 +2153,8 @@ int parse_threads(int pid, struct pid **_t, int *_n)
 	return 0;
 }
 
-int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
+int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n)
 {
-	FILE *f;
-
-	f = fopen_proc(pid, "cgroup");
-	if (f == NULL)
-		return -1;
 	while (fgets(buf, BUF_SIZE, f)) {
 		struct cg_ctl *ncc, *cc;
 		char *name, *path = NULL, *e;
@@ -2190,6 +2185,7 @@ int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
 
 		ncc->name = xstrdup(name);
 		ncc->path = xstrdup(path);
+		ncc->cgns_prefix = 0;
 		if (!ncc->name || !ncc->path) {
 			xfree(ncc->name);
 			xfree(ncc->path);
@@ -2205,15 +2201,94 @@ int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
 		(*n)++;
 	}
 
-	fclose(f);
 	return 0;
 
 err:
 	put_ctls(retl);
-	fclose(f);
 	return -1;
 }
 
+/* Parse /proc/<pid>/cgroup into retl/n; if args holds the task's own view of
+ * that file, also set cgns_prefix on entries whose paths differ from it.
+ * Returns 0 on success, -1 on error. */
+int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n)
+{
+	FILE *f;
+	int ret;
+	LIST_HEAD(internal);
+	unsigned int n_internal = 0; /* parse_cgroup_file() increments it */
+	struct cg_ctl *intern, *ext;
+
+	f = fopen_proc(pid, "cgroup");
+	if (!f) {
+		pr_perror("couldn't open task cgroup file");
+		return -1;
+	}
+
+	ret = parse_cgroup_file(f, retl, n);
+	fclose(f);
+	if (ret < 0)
+		return -1;
+
+	/* No parasite args: we're dumping criu's cg set, so we don't need to
+	 * parse the "internal" cgroup set to find namespace boundaries.
+	 */
+	if (!args)
+		return 0;
+
+	f = fmemopen(args->contents, strlen(args->contents), "r");
+	if (!f) {
+		pr_perror("couldn't fmemopen cgroup buffer:\n%s\n", args->contents);
+		return -1;
+	}
+
+	ret = parse_cgroup_file(f, &internal, &n_internal);
+	fclose(f);
+	if (ret < 0) {
+		pr_err("couldn't parse internal cgroup file\n");
+		return -1;
+	}
+
+	/* Here's where we actually compute the cgns prefix. Consider a task
+	 * in /foo/bar which has unshared its namespace at /foo. The internal
+	 * path is /bar, but the external path is /foo/bar: the internal path
+	 * (minus its leading /) is a suffix of the external one, and
+	 * cgns_prefix is the offset in the external path where it starts.
+	 */
+	list_for_each_entry(intern, &internal, l) {
+		list_for_each_entry(ext, retl, l) {
+			/* +1 here to chop off the leading / */
+			const char *suffix = intern->path + 1;
+			size_t ext_len, suffix_len;
+
+			if (strcmp(ext->name, intern->name))
+				continue;
+
+			/* Unshared at / (or no cgroup namespace relative to
+			 * criu): the paths are equal, no prefix needed.
+			 */
+			if (!strcmp(ext->path, intern->path))
+				continue;
+
+			/* compare lengths first: never point before ext->path */
+			ext_len = strlen(ext->path);
+			suffix_len = strlen(suffix);
+			if (suffix_len > ext_len ||
+			    strcmp(ext->path + ext_len - suffix_len, suffix)) {
+				pr_err("invalid cgroup configuration, %s is not a suffix of %s\n", intern->path, ext->path);
+				ret = -1;
+				goto out;
+			}
+
+			ext->cgns_prefix = ext_len - suffix_len;
+		}
+	}
+
+out:
+	put_ctls(&internal);
+	return ret;
+}
+
 void put_ctls(struct list_head *l)
 {
 	struct cg_ctl *c, *n;
diff --git a/images/cgroup.proto b/images/cgroup.proto
index dcd2fe8..e4154f2 100644
--- a/images/cgroup.proto
+++ b/images/cgroup.proto
@@ -23,8 +23,9 @@ message cg_controller_entry {
 }
 
 message cg_member_entry {
-	required string name	= 1;
-	required string path	= 2;
+	required string name		= 1;
+	required string path		= 2;
+	optional uint32 cgns_prefix	= 3;
 }
 
 message cg_set_entry {
diff --git a/images/core.proto b/images/core.proto
index 6def5d9..824ee26 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -57,6 +57,7 @@ message task_kobj_ids_entry {
 	optional uint32			uts_ns_id	= 8;
 	optional uint32			mnt_ns_id	= 9;
 	optional uint32			user_ns_id	= 10;
+	optional uint32			cgroup_ns_id	= 11;
 }
 
 message thread_sas_entry {
-- 
2.5.0



More information about the CRIU mailing list