[CRIU] [RFC 3/4] cgroup: add support for cgroup namespaces

Tycho Andersen tycho.andersen at canonical.com
Thu Feb 11 09:37:34 PST 2016


Cgroup namespaces are about to be merged into the kernel (indeed, they
went into and out of 4.5 over minor issues), and will be carried as a
patchset in the Ubuntu 16.04 kernel. Here's an attempt at c/r.

There are essentially three key steps:
  * on dump, in parse_task_cgroup, we should ask the task what cgroups it
    thinks it is in (unless it has the same cgroup ns id as its parent, then we
    should just take the prefixes from the parent's set), and set the prefix on
    the cg set
  * add a new restore step, prepare_cgroup_namespace(), which happens in
    prepare_task_cgroup() that does an unshare() if necessary
  * when restoring, in move_in_cgroup, if we're going to restore via usernsd,
    leave the full path. If not, use (cgset->path + len(cgset->cgns_prefix)) as
    the path, since we will have already moved into the cgns_prefix and unshared.

Another observation here is that we can support nesting, since these are
restored hierarchically by nature.

Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
 cgroup.c                | 54 ++++++++++++++++++++++++++-----
 include/cgroup.h        |  6 ++++
 include/namespaces.h    |  8 ++++-
 include/proc_parse.h    |  3 +-
 include/syscall-types.h |  6 +++-
 namespaces.c            | 18 +++++++++++
 proc_parse.c            | 85 ++++++++++++++++++++++++++++++++++++++++++++-----
 protobuf/cgroup.proto   |  5 +--
 protobuf/core.proto     |  1 +
 9 files changed, 165 insertions(+), 21 deletions(-)

diff --git a/cgroup.c b/cgroup.c
index 704f144..1ed0025 100644
--- a/cgroup.c
+++ b/cgroup.c
@@ -18,6 +18,7 @@
 #include "util-pie.h"
 #include "namespaces.h"
 #include "seize.h"
+#include "syscall-types.h"
 #include "protobuf.h"
 #include "protobuf/core.pb-c.h"
 #include "protobuf/cgroup.pb-c.h"
@@ -148,6 +149,11 @@ static bool cg_set_compare(struct cg_set *set, struct list_head *ctls, int what)
 		if (strcmp(c1->name, c2->name))
 			return false;
 
+		/* must have the same cgns prefix to be considered equal */
+		if (((long)c1->cgns_prefix ^ (long)c2->cgns_prefix) ||
+		    (c1->cgns_prefix && strcmp(c1->cgns_prefix, c2->cgns_prefix)))
+			return false;
+
 		switch (what) {
 		case CGCMP_MATCH:
 			if (strcmp(c1->path, c2->path))
@@ -662,18 +668,25 @@ static int collect_cgroups(struct list_head *ctls)
 
 int dump_task_cgroup(struct pstree_item *item, u32 *cg_id)
 {
-	int pid;
+	int pid, virt_pid;
 	LIST_HEAD(ctls);
 	unsigned int n_ctls = 0;
 	struct cg_set *cs;
 
-	if (item)
+	if (item) {
 		pid = item->pid.real;
-	else
+
+		/* we only need to resolve the virtual pid's cgroups if we
+		 * actually have cgns enabled.
+		 */
+		virt_pid = item->ids->has_cgroup_ns_id ? item->pid.virt : -1;
+	} else {
 		pid = getpid();
+		virt_pid = -1;
+	}
 
 	pr_info("Dumping cgroups for %d\n", pid);
-	if (parse_task_cgroup(pid, &ctls, &n_ctls))
+	if (parse_task_cgroup(pid, virt_pid, &ctls, &n_ctls))
 		return -1;
 
 	cs = get_cg_set(&ctls, n_ctls);
@@ -891,6 +904,7 @@ static int dump_sets(CgroupEntry *cg)
 			cg_member_entry__init(ce);
 			ce->name = ctl->name;
 			ce->path = ctl->path;
+			ce->cgns_prefix = ctl->cgns_prefix;
 			se->ctls[c++] = ce++;
 		}
 
@@ -997,13 +1011,13 @@ static int userns_move(void *arg, int fd, pid_t pid)
 	return 0;
 }
 
-static int move_in_cgroup(CgSetEntry *se)
+static int move_in_cgroup(CgSetEntry *se, bool do_cgns_set)
 {
 	int i;
 
 	pr_info("Move into %d\n", se->id);
 	for (i = 0; i < se->n_ctls; i++) {
-		char aux[PATH_MAX];
+		char aux[PATH_MAX], *path;
 		int fd = -1, err, j, aux_off;
 		CgMemberEntry *ce = se->ctls[i];
 		CgControllerEntry *ctrl = NULL;
@@ -1023,7 +1037,28 @@ static int move_in_cgroup(CgSetEntry *se)
 
 		aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0);
 
-		snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path);
+		if (do_cgns_set && ce->cgns_prefix) {
+			snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->cgns_prefix);
+			if (userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1) < 0) {
+				pr_perror("couldn't set cgns prefix %s", aux);
+				return -1;
+			}
+
+			if (unshare(CLONE_NEWCGROUP) < 0) {
+				pr_perror("couldn't unshare cgns");
+				return -1;
+			}
+		}
+
+		/* Since above we entered the cgns prefix, if we're not going to
+		 * restore via usernsd, we need to adjust the path here because
+		 * we're now in the cgns prefix path instead of /.
+		 */
+		path = ce->path;
+		if (ce->cgns_prefix && !(root_ns_mask & CLONE_NEWUSER))
+			path += strlen(ce->cgns_prefix);
+
+		snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", path);
 		pr_debug("  `-> %s\n", aux);
 		err = userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1);
 		if (err < 0) {
@@ -1039,6 +1074,7 @@ int prepare_task_cgroup(struct pstree_item *me)
 {
 	CgSetEntry *se;
 	u32 current_cgset;
+	bool same_cgns_as_parent = false;
 
 	if (!rsti(me)->cg_set)
 		return 0;
@@ -1059,7 +1095,7 @@ int prepare_task_cgroup(struct pstree_item *me)
 		return -1;
 	}
 
-	return move_in_cgroup(se);
+	return move_in_cgroup(se, !same_cgns_as_parent);
 }
 
 void fini_cgroup(void)
@@ -1569,3 +1605,5 @@ int new_cg_root_add(char *controller, char *newroot)
 	list_add(&o->node, &opts.new_cgroup_roots);
 	return 0;
 }
+
+struct ns_desc cgroup_ns_desc = NS_DESC_ENTRY(CLONE_NEWCGROUP, "cgroup");
diff --git a/include/cgroup.h b/include/cgroup.h
index 393ee3d..13eb684 100644
--- a/include/cgroup.h
+++ b/include/cgroup.h
@@ -1,6 +1,10 @@
 #ifndef __CR_CGROUP_H__
 #define __CR_CGROUP_H__
+
 #include "asm/int.h"
+
+#include "protobuf/core.pb-c.h"
+
 struct pstree_item;
 extern u32 root_cg_set;
 int dump_task_cgroup(struct pstree_item *, u32 *);
@@ -62,4 +66,6 @@ struct cg_controller *new_controller(const char *name);
 /* parse all global cgroup information into structures */
 int parse_cg_info(void);
 int new_cg_root_add(char *controller, char *newroot);
+
+extern struct ns_desc cgroup_ns_desc;
 #endif /* __CR_CGROUP_H__ */
diff --git a/include/namespaces.h b/include/namespaces.h
index 4ce5a34..ac938e2 100644
--- a/include/namespaces.h
+++ b/include/namespaces.h
@@ -4,8 +4,14 @@
 #include "compiler.h"
 #include "files.h"
 
+/* including syscall-types.h gives another weird error; do we really need to
+ * define this twice? */
+#ifndef CLONE_NEWCGROUP
+#define CLONE_NEWCGROUP	0x02000000
+#endif
+
 /* Nested namespaces are supported only for these types */
-#define CLONE_SUBNS	(CLONE_NEWNS)
+#define CLONE_SUBNS	(CLONE_NEWNS | CLONE_NEWCGROUP)
 
 struct ns_desc {
 	unsigned int	cflag;
diff --git a/include/proc_parse.h b/include/proc_parse.h
index 33cd077..0f97579 100644
--- a/include/proc_parse.h
+++ b/include/proc_parse.h
@@ -195,13 +195,14 @@ struct cg_ctl {
 	struct list_head l;
 	char *name;
 	char *path;
+	char *cgns_prefix;
 };
 
 /*
  * Returns the list of cg_ctl-s sorted by name
  */
 
-extern int parse_task_cgroup(int pid, struct list_head *l, unsigned int *n);
+extern int parse_task_cgroup(int pid, int virt_pid, struct list_head *l, unsigned int *n);
 extern void put_ctls(struct list_head *);
 
 int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups);
diff --git a/include/syscall-types.h b/include/syscall-types.h
index e3a114d..b056f6d 100644
--- a/include/syscall-types.h
+++ b/include/syscall-types.h
@@ -65,7 +65,11 @@ struct itimerspec;
 #define CLONE_NEWUSER	0x10000000
 #endif
 
-#define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER)
+#ifndef CLONE_NEWCGROUP
+#define CLONE_NEWCGROUP	0x02000000
+#endif
+
+#define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP)
 
 #define setns	sys_setns
 
diff --git a/namespaces.c b/namespaces.c
index 9a7836b..c7e4302 100644
--- a/namespaces.c
+++ b/namespaces.c
@@ -19,6 +19,7 @@
 #include "pstree.h"
 #include "namespaces.h"
 #include "net.h"
+#include "cgroup.h"
 
 #include "protobuf.h"
 #include "protobuf/ns.pb-c.h"
@@ -31,6 +32,7 @@ static struct ns_desc *ns_desc_array[] = {
 	&pid_ns_desc,
 	&user_ns_desc,
 	&mnt_ns_desc,
+	&cgroup_ns_desc,
 };
 
 static unsigned int parse_ns_link(char *link, size_t len, struct ns_desc *d)
@@ -374,6 +376,10 @@ static int open_ns_fd(struct file_desc *d)
 			item = t;
 			nd = &mnt_ns_desc;
 			break;
+		} else if (ids->cgroup_ns_id == nfi->nfe->ns_id) {
+			item = t;
+			nd = &cgroup_ns_desc;
+			break;
 		}
 	}
 
@@ -489,6 +495,13 @@ int dump_task_ns_ids(struct pstree_item *item)
 		return -1;
 	}
 
+	ids->has_cgroup_ns_id = true;
+	ids->cgroup_ns_id = get_ns_id(pid, &cgroup_ns_desc);
+	if (!ids->cgroup_ns_id) {
+		pr_err("Can't make cgroup id\n");
+		return -1;
+	}
+
 	return 0;
 }
 
@@ -735,6 +748,11 @@ static int do_dump_namespaces(struct ns_id *ns)
 				ns->id, ns->ns_pid);
 		ret = dump_net_ns(ns->id);
 		break;
+	case CLONE_NEWCGROUP:
+		pr_info("Dump CGROUP namespace info %d via %d\n",
+				ns->id, ns->ns_pid);
+		/* handled separately in cgroup dumping code */
+		break;
 	default:
 		pr_err("Unknown namespace flag %x\n", ns->nd->cflag);
 		break;
diff --git a/proc_parse.c b/proc_parse.c
index c7c5775..8a3edff 100644
--- a/proc_parse.c
+++ b/proc_parse.c
@@ -2153,13 +2153,8 @@ int parse_threads(int pid, struct pid **_t, int *_n)
 	return 0;
 }
 
-int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
+int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n)
 {
-	FILE *f;
-
-	f = fopen_proc(pid, "cgroup");
-	if (f == NULL)
-		return -1;
 	while (fgets(buf, BUF_SIZE, f)) {
 		struct cg_ctl *ncc, *cc;
 		char *name, *path = NULL, *e;
@@ -2190,6 +2185,7 @@ int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
 
 		ncc->name = xstrdup(name);
 		ncc->path = xstrdup(path);
+		ncc->cgns_prefix = NULL;
 		if (!ncc->name || !ncc->path) {
 			xfree(ncc->name);
 			xfree(ncc->path);
@@ -2205,20 +2201,93 @@ int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
 		(*n)++;
 	}
 
-	fclose(f);
 	return 0;
 
 err:
 	put_ctls(retl);
-	fclose(f);
 	return -1;
 }
 
+/*
+ * Parse pid's "cgroup" proc file into retl/n. If virt_pid >= 0, also parse <virt_pid>/cgroup
+ * relative to the CR_PROC_FD_OFF service fd and store in cgns_prefix the leading part of each
+ * outer path that this inner view lacks. Returns 0, or -1 (retl is put_ctls()'d once it was filled).
+ */
+int parse_task_cgroup(int pid, int virt_pid, struct list_head *retl, unsigned int *n)
+{
+	FILE *f;
+	int ret;
+	char name[PATH_MAX]; /* not "buf": parse_cgroup_file() reads into an outer-scope "buf" */
+	LIST_HEAD(internal);
+	unsigned int n_internal = 0; /* parse_cgroup_file() only increments the counter */
+	struct cg_ctl *intern, *ext;
+
+	f = fopen_proc(pid, "cgroup");
+	if (!f) {
+		pr_perror("couldn't open task cgroup file");
+		return -1;
+	}
+
+	ret = parse_cgroup_file(f, retl, n);
+	fclose(f);
+	if (ret < 0 || virt_pid < 0)
+		return ret < 0 ? -1 : 0;
+
+	snprintf(name, sizeof(name), "%d/cgroup", virt_pid);
+	f = fopenat(get_service_fd(CR_PROC_FD_OFF), name, "r");
+	if (!f) {
+		pr_perror("couldn't open task cgroup file from inside");
+		put_ctls(retl);
+		return -1;
+	}
+
+	ret = parse_cgroup_file(f, &internal, &n_internal);
+	fclose(f);
+	if (ret < 0) {
+		pr_err("couldn't parse internal cgroup file\n");
+		put_ctls(retl); /* "internal" is left to parse_cgroup_file()'s own error path */
+		return -1;
+	}
+
+	ret = -1;
+	list_for_each_entry(intern, &internal, l) {
+		/* an inner path of "/" means the whole outer path is the prefix */
+		const char *tail = strcmp(intern->path, "/") ? intern->path : "";
+		size_t tlen = strlen(tail);
+		list_for_each_entry(ext, retl, l) {
+			size_t elen = strlen(ext->path);
+			char *pos = ext->path + (elen > tlen ? elen - tlen : 0);
+
+			/* identical paths: no cgroup namespace, or it was unshared at / */
+			if (strcmp(ext->name, intern->name) || !strcmp(ext->path, intern->path))
+				continue;
+			/* compare at the end: strstr() would stop at the first occurrence */
+			if (strcmp(pos, tail)) {
+				pr_err("invalid cgroup configuration %s is not in %s\n", intern->path, ext->path);
+				goto out;
+			}
+			/* copy the whole path, then cut the copy; ext->path is never modified */
+			ext->cgns_prefix = xstrdup(ext->path);
+			if (!ext->cgns_prefix)
+				goto out;
+			ext->cgns_prefix[pos - ext->path] = '\0';
+		}
+	}
+	ret = 0;
+out:
+	put_ctls(&internal);
+	if (ret < 0)
+		put_ctls(retl);
+	return ret;
+}
+
 void put_ctls(struct list_head *l)
 {
 	struct cg_ctl *c, *n;
 
 	list_for_each_entry_safe(c, n, l, l) {
+		if (c->cgns_prefix)
+			xfree(c->cgns_prefix);
 		xfree(c->name);
 		xfree(c->path);
 		xfree(c);
diff --git a/protobuf/cgroup.proto b/protobuf/cgroup.proto
index dcd2fe8..f255c8c 100644
--- a/protobuf/cgroup.proto
+++ b/protobuf/cgroup.proto
@@ -23,8 +23,9 @@ message cg_controller_entry {
 }
 
 message cg_member_entry {
-	required string name	= 1;
-	required string path	= 2;
+	required string name		= 1;
+	required string path		= 2;
+	optional string cgns_prefix	= 3;
 }
 
 message cg_set_entry {
diff --git a/protobuf/core.proto b/protobuf/core.proto
index 6def5d9..824ee26 100644
--- a/protobuf/core.proto
+++ b/protobuf/core.proto
@@ -57,6 +57,7 @@ message task_kobj_ids_entry {
 	optional uint32			uts_ns_id	= 8;
 	optional uint32			mnt_ns_id	= 9;
 	optional uint32			user_ns_id	= 10;
+	optional uint32			cgroup_ns_id	= 11;
 }
 
 message thread_sas_entry {
-- 
2.6.4



More information about the CRIU mailing list