[CRIU] [PATCH 6/7] cg: Restore tasks into proper cgroups

Pavel Emelyanov xemul at parallels.com
Thu May 8 06:08:57 PDT 2014


On restore, find out which cgroup sets tasks live in and
move them there.

Optimization note -- move tasks into cgroups _before_ forking
kids to make them inherit cgroups if required. This saves
a lot of time.

Accessibility note -- when moving tasks into cgroups don't
search for existing host mounts (they may not be available)
and don't mount temporary ones (may be impossible due to
user namespaces). Instead introduce a service fd with a yard
of mounts.

Signed-off-by: Pavel Emelyanov <xemul at parallels.com>
---
 cgroup.c            | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 cr-restore.c        |  23 +++++-
 image.c             |  10 +++
 include/cgroup.h    |   4 ++
 include/rst_info.h  |   2 +
 include/servicefd.h |   1 +
 6 files changed, 240 insertions(+), 1 deletion(-)

diff --git a/cgroup.c b/cgroup.c
index 5541704..2d9ebad 100644
--- a/cgroup.c
+++ b/cgroup.c
@@ -30,10 +30,23 @@ struct cg_set {
 
 static LIST_HEAD(cg_sets);
 static unsigned int n_sets;
+static CgSetEntry **rst_sets;
+static char *cg_yard;
 static struct cg_set *root_cgset; /* Set root item lives in */
 static struct cg_set *criu_cgset; /* Set criu process lives in */
 static u32 cg_set_ids = 1;
 
+static CgSetEntry *find_rst_set_by_id(u32 id)
+{
+	int i;
+
+	for (i = 0; i < n_sets; i++)
+		if (rst_sets[i]->id == id)
+			return rst_sets[i];
+
+	return NULL;
+}
+
 #define CGCMP_MATCH	1	/* check for exact match */
 #define CGCMP_ISSUB	2	/* check set is subset of ctls */
 
@@ -233,3 +246,191 @@ int dump_cgroups(void)
 	pr_info("Writing CG image\n");
 	return pb_write_one(fdset_fd(glob_fdset, CR_FD_CGROUP), &cg, PB_CGROUP);
 }
+
+static int move_in_cgroup(CgSetEntry *se)
+{
+	int cg, i;
+
+	pr_info("Move into %d\n", se->id);
+	cg = get_service_fd(CGROUP_YARD);
+	for (i = 0; i < se->n_ctls; i++) {
+		char aux[1024];
+		int fd, err;
+		ControllerEntry *ce = se->ctls[i];
+
+		sprintf(aux, "%s/%s/tasks", ce->name, ce->path);
+		pr_debug("  `-> %s\n", aux);
+		err = fd = openat(cg, aux, O_WRONLY);
+		if (fd >= 0) {
+			/*
+			 * Writing zero into this file moves current
+			 * task w/o any permissions checks :)
+			 */
+			err = write(fd, "0", 1);
+			close(fd);
+		}
+
+		if (err < 0) {
+			pr_perror("Can't move into %s (%d/%d)\n",
+					aux, err, fd);
+			return -1;
+		}
+	}
+
+	close_service_fd(CGROUP_YARD);
+	return 0;
+}
+
+int prepare_task_cgroup(struct pstree_item *me)
+{
+	CgSetEntry *se;
+	u32 current_cgset;
+
+	if (!me->rst->cg_set)
+		return 0;
+
+	if (me->parent)
+		current_cgset = me->parent->rst->cg_set;
+	else
+		current_cgset = root_cg_set;
+
+	if (me->rst->cg_set == current_cgset) {
+		pr_info("Cgroups %d inherited from parent\n", current_cgset);
+		close_service_fd(CGROUP_YARD);
+		return 0;
+	}
+
+	se = find_rst_set_by_id(me->rst->cg_set);
+	if (!se) {
+		pr_err("No set %d found\n", me->rst->cg_set);
+		return -1;
+	}
+
+	return move_in_cgroup(se);
+}
+
+void fini_cgroup(void)
+{
+	if (!cg_yard)
+		return;
+
+	close_service_fd(CGROUP_YARD);
+	umount2(cg_yard, MNT_DETACH);
+	rmdir(cg_yard);
+	xfree(cg_yard);
+}
+
+/*
+ * Prepare the CGROUP_YARD service descriptor. This is a
+ * tmpfs mount with a set of ctl->name directories, each
+ * one having the respective cgroup mounted.
+ *
+ * It's required for two reasons.
+ *
+ * First, if we move more than one task into cgroups it's
+ * faster to have the cgroup tree visible to them all in one
+ * single place. Searching for existing mounts in criu's
+ * space is not nice: parsing /proc/mounts is not very
+ * fast and, besides, not all cgroups may be mounted.
+ *
+ * Second, when we have user-namespaces support we will
+ * lose the ability to mount cgroups on-demand, so prepare
+ * them in advance.
+ */
+
+static int prepare_cgroup_sfd(CgSetEntry *root_set)
+{
+	int off, i;
+	char paux[PATH_MAX], aux[128];
+
+	pr_info("Preparing cgroups yard\n");
+
+	off = sprintf(paux, ".criu.cgyard.XXXXXX");
+	if (mkdtemp(paux) == NULL) {
+		pr_perror("Can't make temp cgyard dir");
+		return -1;
+	}
+
+	cg_yard = xstrdup(paux);
+	if (!cg_yard) {
+		rmdir(paux);
+		return -1;
+	}
+
+	if (mount("none", cg_yard, "tmpfs", 0, NULL)) {
+		pr_perror("Can't mount tmpfs in cgyard");
+		goto err;
+	}
+
+	for (i = 0; i < root_set->n_ctls; i++) {
+		ControllerEntry *ce = root_set->ctls[i];
+		char *opt = ce->name;
+
+		sprintf(paux + off, "/%s", ce->name);
+		if (strstartswith(ce->name, "name=")) {
+			sprintf(aux, "none,%s", ce->name);
+			opt = aux;
+		}
+
+		if (mkdir(paux, 0700)) {
+			pr_perror("Can't make cgyard subdir");
+			goto err;
+		}
+
+		if (mount("none", paux, "cgroup", 0, opt) < 0) {
+			pr_perror("Can't mount %s cgyard", ce->name);
+			goto err;
+		}
+	}
+
+	pr_debug("Opening %s as cg yard\n", cg_yard);
+	i = open(cg_yard, O_DIRECTORY);
+	if (i < 0) {
+		pr_perror("Can't open cgyard");
+		goto err;
+	}
+
+	off = install_service_fd(CGROUP_YARD, i);
+	close(i);
+	if (off < 0)
+		goto err;
+
+	return 0;
+
+err:
+	fini_cgroup();
+	return -1;
+}
+
+int prepare_cgroup(void)
+{
+	int fd, ret;
+	CgroupEntry *ce;
+
+	fd = open_image(CR_FD_CGROUP, O_RSTR | O_OPT);
+	if (fd < 0) {
+		if (errno == ENOENT) /* backward compatibility */
+			return 0;
+		else
+			return fd;
+	}
+
+	ret = pb_read_one_eof(fd, &ce, PB_CGROUP);
+	close(fd);
+	if (ret <= 0) /* Zero is OK -- no sets there. */
+		return ret;
+
+	n_sets = ce->n_sets;
+	rst_sets = ce->sets;
+	if (n_sets)
+		/*
+		 * We rely on the fact that all sets contain the same
+		 * set of controllers. This is checked during dump
+		 * with cg_set_compare(CGCMP_ISSUB) call.
+		 */
+		ret = prepare_cgroup_sfd(rst_sets[0]);
+	else
+		ret = 0;
+
+	return ret;
+}
diff --git a/cr-restore.c b/cr-restore.c
index e9af4e6..33c0403 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -117,6 +117,9 @@ static int crtools_prepare_shared(void)
 	if (tty_prep_fds())
 		return -1;
 
+	if (prepare_cgroup())
+		return -1;
+
 	return 0;
 }
 
@@ -902,6 +905,7 @@ static inline int fork_with_pid(struct pstree_item *item)
 			return -1;
 
 		item->state = ca.core->tc->task_state;
+		item->rst->cg_set = ca.core->tc->cg_set;
 
 		switch (item->state) {
 		case TASK_ALIVE:
@@ -914,8 +918,14 @@ static inline int fork_with_pid(struct pstree_item *item)
 			pr_err("Unknown task state %d\n", item->state);
 			return -1;
 		}
-	} else
+	} else {
+		/*
+		 * Helper entry will not get moved around and thus
+		 * will live in the parent's cgset.
+		 */
+		item->rst->cg_set = item->parent->rst->cg_set;
 		ca.core = NULL;
+	}
 
 	ret = -1;
 
@@ -1278,6 +1288,15 @@ static int restore_task_with_children(void *_arg)
 			exit(1);
 	}
 
+	/*
+	 * Call this _before_ forking to optimize cgroups
+	 * restore -- if all tasks live in one set of cgroups
+	 * we will only move the root one there, others will
+	 * just have it inherited.
+	 */
+	if (prepare_task_cgroup(current) < 0)
+		return -1;
+
 	if (create_children_and_session())
 		goto err;
 
@@ -1642,6 +1661,8 @@ int cr_restore_tasks(void)
 		goto err;
 
 	ret = restore_root_task(root_item);
+
+	fini_cgroup();
 err:
 	cr_plugin_fini();
 	return ret;
diff --git a/image.c b/image.c
index a2bc63c..566073b 100644
--- a/image.c
+++ b/image.c
@@ -15,6 +15,7 @@
 bool fdinfo_per_id = false;
 bool ns_per_id = false;
 TaskKobjIdsEntry *root_ids;
+u32 root_cg_set;
 
 int check_img_inventory(void)
 {
@@ -39,6 +40,15 @@ int check_img_inventory(void)
 		memcpy(root_ids, he->root_ids, sizeof(*root_ids));
 	}
 
+	if (he->has_root_cg_set) {
+		if (he->root_cg_set == 0) {
+			pr_err("Corrupted root cgset\n");
+			goto out_err;
+		}
+
+		root_cg_set = he->root_cg_set;
+	}
+
 	if (he->img_version != CRTOOLS_IMAGES_V1) {
 		pr_err("Not supported images version %u\n", he->img_version);
 		goto out_err;
diff --git a/include/cgroup.h b/include/cgroup.h
index 3a8cca5..148b26f 100644
--- a/include/cgroup.h
+++ b/include/cgroup.h
@@ -2,6 +2,10 @@
 #define __CR_CGROUP_H__
 #include "asm/int.h"
 struct pstree_item;
+extern u32 root_cg_set;
 int dump_task_cgroup(struct pstree_item *, u32 *);
 int dump_cgroups(void);
+int prepare_task_cgroup(struct pstree_item *);
+int prepare_cgroup(void);
+void fini_cgroup(void);
 #endif /* __CR_CGROUP_H__ */
diff --git a/include/rst_info.h b/include/rst_info.h
index 6c146cf..d4df529 100644
--- a/include/rst_info.h
+++ b/include/rst_info.h
@@ -43,6 +43,8 @@ struct rst_info {
 	struct vm_area_list	vmas;
 	struct _MmEntry		*mm;
 
+	u32			cg_set;
+
 	union {
 		struct pstree_item	*pgrp_leader;
 		futex_t			pgrp_set;
diff --git a/include/servicefd.h b/include/servicefd.h
index 89cf3f3..bdadc0f 100644
--- a/include/servicefd.h
+++ b/include/servicefd.h
@@ -16,6 +16,7 @@ enum sfd_type {
 			 *  For restore -- CRIU ns' proc
 			 */
 	ROOT_FD_OFF,	/* Root of the namespace we dump/restore */
+	CGROUP_YARD,
 
 	SERVICE_FD_MAX
 };
-- 
1.8.4.2


More information about the CRIU mailing list