[CRIU] [PATCH] Attempt to restore cgroups

Andrew Vagin avagin at parallels.com
Mon Jul 14 14:17:12 PDT 2014


Hi Tycho,

Jenkins is worried about this patch. All jobs that execute tests in parallel
started to fail after this patch. Tycho, could you investigate where the problem is?

$ bash -x test/jenkins/criu-dump.sh
...
Test: zdtm/live/static/sock_opts01, Result: FAIL
==================================== ERROR ====================================
Test: zdtm/live/static/sock_opts01, Namespace: 1
Dump log   : /root/criu/test/dump/sock_opts01/28469/1/dump.log
--------------------------------- grep Error ---------------------------------
(00.021290) Error (cgroup.c:418): cg: failed walking /root/criu/test/dump/sock_opts00/28471/1/.criu.cgmounts.JocWZf// for empty cgroups
(00.021303) Error (cr-dump.c:1601): Dump core (pid: 28469) failed with -1
(00.025737) Error (cr-dump.c:1914): Dumping FAILED.
------------------------------------- END -------------------------------------
================================= ERROR OVER =================================


On Tue, Jul 08, 2014 at 12:36:41PM -0500, Tycho Andersen wrote:
> During the dump phase, /proc/cgroups is parsed to find co-mounted cgroups.
> Then, for each task /proc/self/cgroup is parsed for the cgroups that it is a
> member of, and that cgroup is traversed to find any child cgroups which may
> also need restoring. All of this information is persisted along with the
> original cg_sets, which indicate which cgroups a task is a member of.
> 
> On restore, an initial phase creates all the cgroups which were saved and
> attempts to restore any properties they had. Then the tasks are restored into
> their respective cgroups via cg_sets as usual.
> 
> Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
> ---
>  cgroup.c                         | 534 +++++++++++++++++++++++++++++++++++++--
>  cr-dump.c                        |   3 +
>  cr-restore.c                     |   6 +-
>  include/cgroup.h                 |  47 ++++
>  include/proc_parse.h             |   3 +
>  include/util.h                   |  14 +-
>  mount.c                          |   3 +
>  proc_parse.c                     |  75 +++++-
>  protobuf/cgroup.proto            |  16 +-
>  protobuf/mnt.proto               |   1 +
>  test/zdtm.sh                     |   2 +
>  test/zdtm/live/static/Makefile   |   1 +
>  test/zdtm/live/static/cgroup01.c | 111 ++++++++
>  util.c                           |  77 ++++++
>  14 files changed, 874 insertions(+), 19 deletions(-)
>  create mode 100644 test/zdtm/live/static/cgroup01.c
> 
> diff --git a/cgroup.c b/cgroup.c
> index 1fe5e6d..bbd14ae 100644
> --- a/cgroup.c
> +++ b/cgroup.c
> @@ -5,6 +5,9 @@
>  #include <unistd.h>
>  #include <sys/mount.h>
>  #include <sys/stat.h>
> +#include <ftw.h>
> +#include <libgen.h>
> +#include "list.h"
>  #include "xmalloc.h"
>  #include "cgroup.h"
>  #include "pstree.h"
> @@ -18,7 +21,8 @@
>  /*
>   * This structure describes set of controller groups
>   * a task lives in. The cg_ctl entries are stored in
> - * the @ctls list sorted by the .name field.
> + * the @ctls list sorted by the .name field and then
> + * by the .path field.
>   */
>  
>  struct cg_set {
> @@ -36,6 +40,13 @@ static struct cg_set *root_cgset; /* Set root item lives in */
>  static struct cg_set *criu_cgset; /* Set criu process lives in */
>  static u32 cg_set_ids = 1;
>  
> +static LIST_HEAD(cgroups);
> +static unsigned int n_cgroups;
> +static struct mount_info *cg_mntinfo;
> +
> +static CgControllerEntry **cg_controllers;
> +static unsigned int n_controllers;
> +
>  static CgSetEntry *find_rst_set_by_id(u32 id)
>  {
>  	int i;
> @@ -118,6 +129,314 @@ static struct cg_set *get_cg_set(struct list_head *ctls, unsigned int n_ctls)
>  	return cs;
>  }
>  
> +struct cg_controller *new_controller(const char *name, int heirarchy)
> +{
> +	struct cg_controller *nc = xmalloc(sizeof(*nc));
> +	if (!nc)
> +		return NULL;
> +
> +	nc->controllers = xmalloc(sizeof(char *));
> +	if (!nc->controllers) {
> +		xfree(nc);
> +		return NULL;
> +	}
> +
> +	nc->controllers[0] = xstrdup(name);
> +	if (!nc->controllers[0]) {
> +		xfree(nc->controllers);
> +		xfree(nc);
> +		return NULL;
> +	}
> +
> +	nc->n_controllers = 1;
> +	nc->heirarchy = heirarchy;
> +
> +	nc->n_heads = 0;
> +	INIT_LIST_HEAD(&nc->heads);
> +
> +	return nc;
> +}
> +
> +int parse_cg_info(void)
> +{
> +	if (parse_cgroups(&cgroups, &n_cgroups) < 0)
> +		return -1;
> +
> +	cg_mntinfo = parse_mountinfo(getpid(), NULL);
> +
> +	if (!cg_mntinfo)
> +		return -1;
> +	return 0;
> +}
> +
> +static int get_cgroup_mount_point(const char *controller, char *path)
> +{
> +	struct mount_info *m;
> +	char name[1024];
> +
> +	for (m = cg_mntinfo; m != NULL; m = m->next) {
> +		if (strcmp(m->fstype->name, "cgroup") == 0) {
> +			char *start, *end;
> +
> +			start = strstr(m->options, "name=");
> +			if (start) {
> +				/* strlen("name=") == 5 */
> +				start = start + 5;
> +
> +				end = strstr(start, ",");
> +				if (end) {
> +					strncpy(name, start, end - start);
> +					name[end - start] = '\0';
> +				} else
> +					strcpy(name, start);
> +			} else {
> +				start = strrchr(m->mountpoint, '/');
> +				if (!start) {
> +					pr_err("bad path %s\n", m->mountpoint);
> +					return -1;
> +				}
> +				strcpy(name, start+1);
> +			}
> +
> +			if (strcmp(name, controller) == 0) {
> +				/* skip the leading '.' in mountpoint */
> +				strcpy(path, m->mountpoint + 1);
> +				return 0;
> +			}
> +		}
> +	}
> +
> +	return -1;
> +}
> +
> +/* This is for use in add_cgroup() as additional arguments for the ftw()
> + * callback */
> +static struct cg_controller	*current_controller;
> +
> +#define EXACT_MATCH	0
> +#define PARENT_MATCH	1
> +#define NO_MATCH	2
> +
> +static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir **rdir)
> +{
> +	struct cgroup_dir *d;
> +	list_for_each_entry(d, dirs, siblings) {
> +		if (strcmp(d->path, path) == 0) {
> +			*rdir = d;
> +			return EXACT_MATCH;
> +		}
> +
> +		if (strstartswith(path, d->path)) {
> +			int ret = find_dir(path, &d->children, rdir);
> +			if (ret == NO_MATCH) {
> +				*rdir = d;
> +				return PARENT_MATCH;
> +			}
> +			return ret;
> +
> +		}
> +	}
> +
> +	return NO_MATCH;
> +}
> +
> +static int add_cgroup(const char *fpath, const struct stat *sb, int typeflag)
> +{
> +	struct cgroup_dir *ncd = NULL, *match;
> +	int ret = 0;
> +	char pbuf[PATH_MAX];
> +
> +	if (typeflag == FTW_D) {
> +		FILE *f;
> +		int mtype;
> +		struct mount_info *mi;
> +
> +		strncpy(pbuf, fpath, PATH_MAX);
> +
> +		pr_info("adding cgroup %s\n", fpath);
> +
> +		ncd = xmalloc(sizeof(*ncd));
> +		if (!ncd) {
> +			ret = -1;
> +			goto out;
> +		}
> +		ncd->path = NULL;
> +
> +		for (mi = cg_mntinfo; mi != NULL; mi = mi->next) {
> +			if (is_path_prefix(fpath, mi->mountpoint + 1)) {
> +				ncd->path = xstrdup(fpath + strlen(mi->mountpoint));
> +				if (!ncd->path) {
> +					ret = -1;
> +					goto out;
> +				}
> +				break;
> +			}
> +		}
> +
> +		if (!ncd->path) {
> +			/* We couldn't find fpath in mountinfo, which means we
> +			 * mounted it ourselves, so we just chop off the first
> +			 * strlen(".criu.cgmounts.XXXXXX").
> +			 */
> +			ncd->path = xstrdup(fpath + 21);
> +			if (!ncd->path) {
> +				ret = -1;
> +				goto out;
> +			}
> +		}
> +
> +		mtype = find_dir(ncd->path, &current_controller->heads, &match);
> +
> +		switch (mtype) {
> +		/* ignore co-mounted cgroups */
> +		case EXACT_MATCH:
> +			goto out;
> +		case PARENT_MATCH:
> +			list_add_tail(&ncd->siblings, &match->children);
> +			match->n_children++;
> +			break;
> +		case NO_MATCH:
> +			list_add_tail(&ncd->siblings, &current_controller->heads);
> +			current_controller->n_heads++;
> +			break;
> +		}
> +
> +		INIT_LIST_HEAD(&ncd->children);
> +		ncd->n_children = 0;
> +		ncd->controller = current_controller;
> +
> +		ncd->flags = 0;
> +
> +		snprintf(pbuf, PATH_MAX, "%s/memory.limit_in_bytes", fpath);
> +		f = fopen(pbuf, "r");
> +		if (f) {
> +			if (fscanf(f, "%" SCNu64, &ncd->mem_limit) != 1) {
> +				pr_err("Failed scanning %s\n", pbuf);
> +				ret = -1;
> +				goto out;
> +			}
> +			ncd->flags |= HAS_MEM_LIMIT;
> +			fclose(f);
> +		}
> +
> +		snprintf(pbuf, PATH_MAX, "%s/cpu.shares", fpath);
> +		f = fopen(pbuf, "r");
> +		if (f) {
> +			if (fscanf(f, "%" SCNu32, &ncd->cpu_shares) != 1) {
> +				pr_err("Failed scanning %s for u32\n", pbuf);
> +				ret = -1;
> +				goto out;
> +			}
> +			ncd->flags |= HAS_CPU_SHARES;
> +			fclose(f);
> +		}
> +
> +		return 0;
> +	}
> +
> +out:
> +	if (ncd) {
> +		if (ncd->path)
> +			xfree(ncd->path);
> +		xfree(ncd);
> +	}
> +
> +	return ret;
> +}
> +
> +static int collect_cgroups(struct list_head *ctls)
> +{
> +	struct cg_ctl *cc;
> +	int ret = 0;
> +
> +	list_for_each_entry(cc, ctls, l) {
> +		char path[PATH_MAX];
> +		char *name, mount_point[PATH_MAX], prefix[] = ".criu.cgmounts.XXXXXX";
> +		bool temp_mount = false;
> +		struct cg_controller *cg;
> +		int i;
> +
> +		if (strstartswith(cc->name, "name="))
> +			name = cc->name + 5;
> +		else
> +			name = cc->name;
> +
> +		if (get_cgroup_mount_point(name, mount_point) < 0) {
> +			/* Someone is trying to dump a process that is in
> +			 * a controller that isn't mounted, so we mount it for
> +			 * them.
> +			 */
> +			char opts[1024];
> +			temp_mount = true;
> +
> +			if (mkdtemp(prefix) == NULL) {
> +				pr_perror("can't make dir for cg mounts\n");
> +				return -1;
> +			}
> +
> +			if (name == cc->name)
> +				sprintf(opts, "%s", name);
> +			else
> +				sprintf(opts, "none,%s", cc->name);
> +
> +			if (mount("none", prefix, "cgroup", 0, opts) < 0) {
> +				pr_perror("couldn't mount %s\n", opts);
> +				rmdir(prefix);
> +				return -1;
> +			}
> +
> +			strcpy(mount_point, prefix);
> +		}
> +
> +		snprintf(path, PATH_MAX, "%s/%s", mount_point, cc->path);
> +
> +		current_controller = NULL;
> +
> +		/* We should get all the "real" (i.e. not name=systemd type)
> +		 * controller from parse_cgroups(), so find that controller if
> +		 * it exists. */
> +		list_for_each_entry(cg, &cgroups, l) {
> +			for (i = 0; i < cg->n_controllers; i++) {
> +				if (strcmp(cg->controllers[i], cc->name) == 0) {
> +					current_controller = cg;
> +					break;
> +				}
> +			}
> +		}
> +
> +		if (!current_controller) {
> +			/* only allow "fake" controllers to be created this way */
> +			if (!strstartswith(cc->name, "name=")) {
> +				pr_err("controller %s not found\n", cc->name);
> +				ret = -1;
> +				goto out;
> +			} else {
> +				struct cg_controller *nc = new_controller(cc->name, -1);
> +				list_add_tail(&nc->l, &cg->l);
> +				n_cgroups++;
> +				current_controller = nc;
> +			}
> +		}
> +
> +		ret = ftw(path, add_cgroup, 4);
> +		if (ret < 0) {
> +			pr_perror("failed walking %s for empty cgroups\n", path);
> +			goto out;
> +		}
> +
> +out:
> +		if (temp_mount) {
> +			umount(prefix);
> +			rmdir(prefix);
> +		}
> +
> +		if (ret < 0)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
>  int dump_task_cgroup(struct pstree_item *item, u32 *cg_id)
>  {
>  	int pid;
> @@ -134,6 +453,9 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id)
>  	if (parse_task_cgroup(pid, &ctls, &n_ctls))
>  		return -1;
>  
> +	if (item == root_item && collect_cgroups(&ctls) < 0)
> +		return -1;
> +
>  	cs = get_cg_set(&ctls, n_ctls);
>  	if (!cs)
>  		return -1;
> @@ -152,6 +474,74 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id)
>  	return 0;
>  }
>  
> +static int dump_cg_dirs(struct list_head *dirs, size_t n_dirs, CgroupDirEntry ***ents)
> +{
> +	struct cgroup_dir *cur;
> +	CgroupDirEntry *cde;
> +	void *m;
> +	int i = 0;
> +
> +	m = xmalloc(n_dirs * (sizeof(CgroupDirEntry *) + sizeof(CgroupDirEntry)));
> +	*ents = m;
> +	if (!m)
> +		return -1;
> +
> +	cde = m + n_dirs * sizeof(CgroupDirEntry *);
> +
> +	list_for_each_entry(cur, dirs, siblings) {
> +		cgroup_dir_entry__init(cde);
> +
> +		cde->path = cur->path;
> +		cde->has_mem_limit = cur->flags & HAS_MEM_LIMIT;
> +		cde->mem_limit = cur->mem_limit;
> +		cde->has_cpu_shares = cur->flags & HAS_CPU_SHARES;
> +		cde->cpu_shares = cur->cpu_shares;
> +
> +		cde->n_children = cur->n_children;
> +		if (cur->n_children > 0)
> +			if (dump_cg_dirs(&cur->children, cur->n_children, &cde->children) < 0) {
> +				xfree(*ents);
> +				return -1;
> +			}
> +		(*ents)[i++] = cde++;
> +	}
> +
> +	return 0;
> +}
> +
> +static int dump_controllers(CgroupEntry *cg)
> +{
> +	struct cg_controller *cur;
> +	CgControllerEntry *ce;
> +	void *m;
> +	int i;
> +
> +	cg->n_controllers = n_cgroups;
> +	m = xmalloc(n_cgroups * (sizeof(CgControllerEntry *) + sizeof(CgControllerEntry)));
> +	cg->controllers = m;
> +	ce = m + cg->n_controllers * sizeof(CgControllerEntry *);
> +	if (!m)
> +		return -1;
> +
> +	i = 0;
> +	list_for_each_entry(cur, &cgroups, l) {
> +		cg_controller_entry__init(ce);
> +
> +		ce->controllers = cur->controllers;
> +		ce->n_controllers = cur->n_controllers;
> +		ce->n_dirs = cur->n_heads;
> +		if (ce->n_dirs > 0)
> +			if (dump_cg_dirs(&cur->heads, cur->n_heads, &ce->dirs) < 0) {
> +				xfree(cg->controllers);
> +				return -1;
> +			}
> +		cg->controllers[i++] = ce++;
> +	}
> +
> +	return 0;
> +}
> +
> +
>  static int dump_sets(CgroupEntry *cg)
>  {
>  	struct cg_set *set;
> @@ -242,6 +632,8 @@ int dump_cgroups(void)
>  
>  	if (dump_sets(&cg))
>  		return -1;
> +	if (dump_controllers(&cg))
> +		return -1;
>  
>  	pr_info("Writing CG image\n");
>  	return pb_write_one(fdset_fd(glob_fdset, CR_FD_CGROUP), &cg, PB_CGROUP);
> @@ -323,6 +715,97 @@ void fini_cgroup(void)
>  	xfree(cg_yard);
>  }
>  
> +static int prepare_cgroup_dir_properties(char *controller, CgroupDirEntry **ents, unsigned int n_ents)
> +{
> +	size_t i;
> +	int cg;
> +
> +	cg = get_service_fd(CGROUP_YARD);
> +
> +	for (i = 0; i < n_ents; i++) {
> +		CgroupDirEntry *e = ents[i];
> +		char path[PATH_MAX];
> +
> +		if (e->has_mem_limit) {
> +			FILE *f;
> +
> +			sprintf(path, "%s/%s/memory.limit_in_bytes", controller, e->path);
> +
> +			f = fopenat(cg, path, "w+");
> +			if (!f) {
> +				pr_perror("Couldn't open %s for writing\n", path);
> +				return -1;
> +			}
> +
> +			fprintf(f, "%" SCNu64, e->mem_limit);
> +			fclose(f);
> +		}
> +
> +		if (e->has_cpu_shares) {
> +			FILE *f;
> +
> +			sprintf(path, "%s/%s/cpu.shares", controller, e->path);
> +
> +			f = fopenat(cg, path, "w+");
> +			if (!f) {
> +				pr_perror("Couldn't open %s for writing\n", path);
> +				return -1;
> +			}
> +
> +			fprintf(f, "%" SCNu32, e->cpu_shares);
> +			fclose(f);
> +		}
> +
> +		if (prepare_cgroup_dir_properties(controller, e->children, e->n_children) < 0)
> +			return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +int prepare_cgroup_properties(void)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < n_controllers; i++) {
> +		CgControllerEntry *c = cg_controllers[i];
> +
> +		if (c->n_controllers < 1) {
> +			pr_err("Each CgControllerEntry should have at least 1 contrller\n");
> +			return -1;
> +		}
> +
> +		/* Here we just restore properties of the first controller.
> +		 * Since they are co-mounted everything will propagate.
> +		 */
> +		if (prepare_cgroup_dir_properties(c->controllers[0], c->dirs, c->n_dirs) < 0)
> +			return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static int prepare_cgroup_dirs(char *paux, size_t off, CgroupDirEntry **ents, size_t n_ents)
> +{
> +	size_t i;
> +	CgroupDirEntry *e;
> +
> +	for (i = 0; i < n_ents; i++) {
> +		e = ents[i];
> +
> +		sprintf(paux + off, "/%s", e->path);
> +
> +		if (mkdirp(paux)) {
> +			pr_perror("Can't make cgroup dir %s", paux);
> +			return -1;
> +		}
> +
> +		prepare_cgroup_dirs(paux, off, e->children, e->n_children);
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * Prepare the CGROUP_YARD service descriptor. This guy is
>   * tmpfs mount with the set of ctl->name directories each
> @@ -341,10 +824,10 @@ void fini_cgroup(void)
>   * them in advance.
>   */
>  
> -static int prepare_cgroup_sfd(CgSetEntry *root_set)
> +static int prepare_cgroup_sfd(CgroupEntry *ce)
>  {
>  	int off, i;
> -	char paux[PATH_MAX], aux[128];
> +	char paux[PATH_MAX];
>  
>  	pr_info("Preparing cgroups yard\n");
>  
> @@ -370,26 +853,44 @@ static int prepare_cgroup_sfd(CgSetEntry *root_set)
>  		goto err;
>  	}
>  
> -	for (i = 0; i < root_set->n_ctls; i++) {
> -		CgMemberEntry *ce = root_set->ctls[i];
> -		char *opt = ce->name;
> +	for (i = 0; i < ce->n_controllers; i++) {
> +		CgControllerEntry *ctrl = ce->controllers[i];
> +		int j, name_off, opt_off;
> +		char *name, opt[1024];
>  
> -		if (strstartswith(ce->name, "name=")) {
> -			sprintf(paux + off, "/%s", ce->name + 5);
> -			sprintf(aux, "none,%s", ce->name);
> -			opt = aux;
> -		} else
> -			sprintf(paux + off, "/%s", ce->name);
> +		if (ctrl->n_controllers < 1) {
> +			pr_err("Each cg_controller_entry must have at least 1 controller");
> +			goto err;
> +		}
> +
> +		if (strstartswith(ctrl->controllers[0], "name=")) {
> +			name = ctrl->controllers[0] + 5;
> +			opt_off = sprintf(opt, "none,%s", ctrl->controllers[0]);
> +		} else {
> +			name = ctrl->controllers[0];
> +			opt_off = sprintf(opt, "%s", ctrl->controllers[0]);
> +		}
> +
> +		for (j = 1; j < ctrl->n_controllers; j++) {
> +			name = ctrl->controllers[i];
> +			opt_off += sprintf(opt + opt_off, ",%s", ctrl->controllers[i]);
> +		}
> +
> +		name_off = sprintf(paux + off, "/%s", name);
>  
>  		if (mkdir(paux, 0700)) {
> -			pr_perror("Can't make cgyard subdir");
> +			pr_perror("Can't make cgyard subdir %s", paux);
>  			goto err;
>  		}
>  
>  		if (mount("none", paux, "cgroup", 0, opt) < 0) {
> -			pr_perror("Can't mount %s cgyard", ce->name);
> +			pr_perror("Can't mount %s cgyard", paux);
>  			goto err;
>  		}
> +
> +		if (prepare_cgroup_dirs(paux, off + name_off, ctrl->dirs, ctrl->n_dirs))
> +			goto err;
> +
>  	}
>  
>  	pr_debug("Opening %s as cg yard\n", cg_yard);
> @@ -431,13 +932,16 @@ int prepare_cgroup(void)
>  
>  	n_sets = ce->n_sets;
>  	rst_sets = ce->sets;
> +
> +	n_controllers = ce->n_controllers;
> +	cg_controllers = ce->controllers;
>  	if (n_sets)
>  		/*
>  		 * We rely on the fact that all sets contain the same
>  		 * set of controllers. This is checked during dump
>  		 * with cg_set_compare(CGCMP_ISSUB) call.
>  		 */
> -		ret = prepare_cgroup_sfd(rst_sets[0]);
> +		ret = prepare_cgroup_sfd(ce);
>  	else
>  		ret = 0;
>  
> diff --git a/cr-dump.c b/cr-dump.c
> index 45f1f5f..d8ad0fc 100644
> --- a/cr-dump.c
> +++ b/cr-dump.c
> @@ -1776,6 +1776,9 @@ int cr_dump_tasks(pid_t pid)
>  	if (vdso_init())
>  		goto err;
>  
> +	if (parse_cg_info())
> +		goto err;
> +
>  	if (write_img_inventory())
>  		goto err;
>  
> diff --git a/cr-restore.c b/cr-restore.c
> index 7d43aab..f9068fc 100644
> --- a/cr-restore.c
> +++ b/cr-restore.c
> @@ -1708,8 +1708,12 @@ int cr_restore_tasks(void)
>  	if (crtools_prepare_shared() < 0)
>  		goto err;
>  
> -	ret = restore_root_task(root_item);
> +	if (restore_root_task(root_item) < 0)
> +		goto err_fc;
>  
> +	ret = prepare_cgroup_properties();
> +
> +err_fc:
>  	fini_cgroup();
>  err:
>  	cr_plugin_fini();
> diff --git a/include/cgroup.h b/include/cgroup.h
> index 148b26f..fc386fa 100644
> --- a/include/cgroup.h
> +++ b/include/cgroup.h
> @@ -7,5 +7,52 @@ int dump_task_cgroup(struct pstree_item *, u32 *);
>  int dump_cgroups(void);
>  int prepare_task_cgroup(struct pstree_item *);
>  int prepare_cgroup(void);
> +/* Restore things like cpu_limit in known cgroups. */
> +int prepare_cgroup_properties(void);
>  void fini_cgroup(void);
> +
> +#define HAS_MEM_LIMIT	(1 << 0)
> +#define HAS_CPU_SHARES	(1 << 1)
> +
> +struct cg_controller;
> +
> +/* This describes a particular cgroup path, e.g. the '/lxc/u1' part of
> + * 'blkio/lxc/u1' and any properties it has.
> + */
> +struct cgroup_dir {
> +	char			*path;
> +	u64			mem_limit;
> +	u32			cpu_shares;
> +	unsigned int		flags;
> +
> +	/* this is how children are linked together */
> +	struct list_head	siblings;
> +
> +	/* more cgroup_dirs */
> +	struct list_head	children;
> +	unsigned int		n_children;
> +
> +	struct cg_controller	*controller;
> +};
> +
> +/* This describes a particular cgroup controller, e.g. blkio or cpuset.
> + * The heads are subdirectories organized in their tree format.
> + */
> +struct cg_controller {
> +	int			heirarchy;
> +	unsigned int		n_controllers;
> +	char			**controllers;
> +
> +	/* cgroup_dirs */
> +	struct list_head 	heads;
> +	unsigned int		n_heads;
> +
> +	/* for cgroup list in cgroup.c */
> +	struct list_head	l;
> +};
> +struct cg_controller *new_controller(const char *name, int heirarchy);
> +
> +/* parse all global cgroup information into structures */
> +int parse_cg_info(void);
> +
>  #endif /* __CR_CGROUP_H__ */
> diff --git a/include/proc_parse.h b/include/proc_parse.h
> index b153328..ff1ea5d 100644
> --- a/include/proc_parse.h
> +++ b/include/proc_parse.h
> @@ -5,6 +5,7 @@
>  #include "asm/types.h"
>  #include "image.h"
>  #include "list.h"
> +#include "cgroup.h"
>  
>  #include "protobuf/eventfd.pb-c.h"
>  #include "protobuf/eventpoll.pb-c.h"
> @@ -203,4 +204,6 @@ struct cg_ctl {
>  extern int parse_task_cgroup(int pid, struct list_head *l, unsigned int *n);
>  extern void put_ctls(struct list_head *);
>  
> +int parse_cgroups(struct list_head *cgroups, unsigned int *n_cgroups);
> +
>  #endif /* __CR_PROC_PARSE_H__ */
> diff --git a/include/util.h b/include/util.h
> index 22a0f3d..522fc33 100644
> --- a/include/util.h
> +++ b/include/util.h
> @@ -288,7 +288,7 @@ int vaddr_to_pfn(unsigned long vaddr, u64 *pfn);
>  /*
>   * Check whether @str starts with @sub
>   */
> -static inline bool strstartswith(char *str, char *sub)
> +static inline bool strstartswith(const char *str, const char *sub)
>  {
>  	while (1) {
>  		if (*sub == '\0') /* end of sub -- match */
> @@ -303,4 +303,16 @@ static inline bool strstartswith(char *str, char *sub)
>  	}
>  }
>  
> +/*
> + * mkdir -p
> + */
> +int mkdirp(const char *path);
> +
> +/*
> + * Tests whether a path is a prefix of another path. This is different than
> + * strstartswith because "/foo" is _not_ a path prefix of "/foobar", since they
> + * refer to different directories.
> + */
> +bool is_path_prefix(const char *path, const char *prefix);
> +FILE *fopenat(int dirfd, char *path, char *cflags);
>  #endif /* __CR_UTIL_H__ */
> diff --git a/mount.c b/mount.c
> index 4d84f48..32410eb 100644
> --- a/mount.c
> +++ b/mount.c
> @@ -861,6 +861,9 @@ static struct fstype fstypes[] = {
>  	}, {
>  		.name = "debugfs",
>  		.code = FSTYPE__DEBUGFS,
> +	}, {
> +		.name = "cgroup",
> +		.code = FSTYPE__CGROUP,
>  	}
>  };
>  
> diff --git a/proc_parse.c b/proc_parse.c
> index f2ea897..88f3c0a 100644
> --- a/proc_parse.c
> +++ b/proc_parse.c
> @@ -1547,7 +1547,7 @@ int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
>  		}
>  
>  		list_for_each_entry(cc, retl, l)
> -			if (strcmp(cc->name, name) >= 0)
> +			if (strcmp(cc->name, name) >= 0 && strcmp(cc->path, path) >= 0)
>  				break;
>  
>  		list_add_tail(&ncc->l, &cc->l);
> @@ -1573,3 +1573,76 @@ void put_ctls(struct list_head *l)
>  		xfree(c);
>  	}
>  }
> +
> +
> +/* Parse and create all the real controllers. This does not include things with
> + * the "name=" prefix, e.g. systemd.
> + */
> +int parse_cgroups(struct list_head *cgroups, unsigned int *n_cgroups)
> +{
> +	FILE *f;
> +	char buf[1024], name[1024];
> +	int heirarchy, ret = 0;
> +	struct cg_controller *cur = NULL;
> +
> +	f = fopen("/proc/cgroups", "r");
> +	if (!f) {
> +		pr_perror("failed opening /proc/cgroups");
> +		return -1;
> +	}
> +
> +	/* throw away the header */
> +	if (!fgets(buf, 1024, f)) {
> +		ret = -1;
> +		goto out;
> +	}
> +
> +	while (fgets(buf, 1024, f)) {
> +		char *n;
> +		char found = 0;
> +
> +		sscanf(buf, "%s %d", name, &heirarchy);
> +		list_for_each_entry(cur, cgroups, l) {
> +			if (cur->heirarchy == heirarchy) {
> +				void *m;
> +
> +				found = 1;
> +				cur->n_controllers++;
> +				m = xrealloc(cur->controllers, sizeof(char *) * cur->n_controllers);
> +				if (!m) {
> +					ret = -1;
> +					goto out;
> +				}
> +
> +				cur->controllers = m;
> +				if (!cur->controllers) {
> +					ret = -1;
> +					goto out;
> +				}
> +
> +				n = xstrdup(name);
> +				if (!n) {
> +					ret = -1;
> +					goto out;
> +				}
> +
> +				cur->controllers[cur->n_controllers-1] = n;
> +				break;
> +			}
> +		}
> +
> +		if (!found) {
> +			struct cg_controller *nc = new_controller(name, heirarchy);
> +			if (!nc) {
> +				ret = -1;
> +				goto out;
> +			}
> +			list_add_tail(&nc->l, &cur->l);
> +			(*n_cgroups)++;
> +		}
> +	}
> +
> +out:
> +	fclose(f);
> +	return ret;
> +}
> diff --git a/protobuf/cgroup.proto b/protobuf/cgroup.proto
> index 139a3ad..4be2249 100644
> --- a/protobuf/cgroup.proto
> +++ b/protobuf/cgroup.proto
> @@ -1,3 +1,16 @@
> +message cgroup_dir_entry {
> +	required string 		path		= 1;
> +	optional uint64 		mem_limit 	= 2;
> +	optional uint32 		cpu_shares	= 3;
> +	repeated cgroup_dir_entry	children 	= 4;
> +}
> +
> +message cg_controller_entry {
> +	required uint32			id		= 1;
> +	repeated string			controllers	= 2;
> +	repeated cgroup_dir_entry	dirs		= 3;
> +}
> +
>  message cg_member_entry {
>  	required string name	= 1;
>  	required string path	= 2;
> @@ -9,5 +22,6 @@ message cg_set_entry {
>  }
>  
>  message cgroup_entry {
> -	repeated cg_set_entry	sets	= 1;
> +	repeated cg_set_entry		sets		= 1;
> +	repeated cg_controller_entry	controllers	= 2;
>  }
> diff --git a/protobuf/mnt.proto b/protobuf/mnt.proto
> index 63532ee..603bb37 100644
> --- a/protobuf/mnt.proto
> +++ b/protobuf/mnt.proto
> @@ -11,6 +11,7 @@ enum fstype {
>  	SECURITYFS		= 9;
>  	FUSECTL			= 10;
>  	DEBUGFS			= 11;
> +	CGROUP			= 12;
>  };
>  
>  message mnt_entry {
> diff --git a/test/zdtm.sh b/test/zdtm.sh
> index 4c3f2d2..38e7484 100755
> --- a/test/zdtm.sh
> +++ b/test/zdtm.sh
> @@ -167,6 +167,7 @@ ns/static/tun
>  static/netns-nf
>  static/netns
>  static/cgroup00
> +static/cgroup01
>  ns/static/clean_mntns
>  "
>  
> @@ -198,6 +199,7 @@ bind-mount
>  mountpoints
>  inotify_irmap
>  cgroup00
> +cgroup01
>  clean_mntns
>  "
>  
> diff --git a/test/zdtm/live/static/Makefile b/test/zdtm/live/static/Makefile
> index f41fd80..893a250 100644
> --- a/test/zdtm/live/static/Makefile
> +++ b/test/zdtm/live/static/Makefile
> @@ -160,6 +160,7 @@ TST_DIR		=				\
>  		tempfs				\
>  		bind-mount			\
>  		cgroup00			\
> +		cgroup01			\
>  
>  TST_DIR_FILE	=				\
>  		chroot				\
> diff --git a/test/zdtm/live/static/cgroup01.c b/test/zdtm/live/static/cgroup01.c
> new file mode 100644
> index 0000000..f6a082f
> --- /dev/null
> +++ b/test/zdtm/live/static/cgroup01.c
> @@ -0,0 +1,111 @@
> +#include <unistd.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <fcntl.h>
> +#include <sys/stat.h>
> +#include <sys/mount.h>
> +#include "zdtmtst.h"
> +
> +const char *test_doc	= "Check that empty cgroups are preserved";
> +const char *test_author	= "Tycho Andersen <tycho.andersen at canonical.com>";
> +
> +char *dirname;
> +TEST_OPTION(dirname, string, "cgroup directory name", 1);
> +static const char *cgname = "zdtmtst";
> +static const char *subname = "subcg";
> +static const char *empty = "empty";
> +
> +int main(int argc, char **argv)
> +{
> +	int cgfd, l, ret = 1;
> +	char aux[1024], paux[1024];
> +	FILE *cgf;
> +	struct stat st;
> +
> +	test_init(argc, argv);
> +
> +	if (mkdir(dirname, 0700) < 0) {
> +		err("Can't make dir");
> +		goto out;
> +	}
> +
> +	sprintf(aux, "none,name=%s", cgname);
> +	if (mount("none", dirname, "cgroup", 0, aux)) {
> +		err("Can't mount cgroups");
> +		goto out_rd;
> +	}
> +
> +	sprintf(paux, "%s/%s", dirname, subname);
> +	mkdir(paux, 0600);
> +
> +	l = sprintf(aux, "%d", getpid());
> +	sprintf(paux, "%s/%s/tasks", dirname, subname);
> +
> +	cgfd = open(paux, O_WRONLY);
> +	if (cgfd < 0) {
> +		err("Can't open tasks");
> +		goto out_rs;
> +	}
> +
> +	l = write(cgfd, aux, l);
> +	close(cgfd);
> +
> +	if (l < 0) {
> +		err("Can't move self to subcg");
> +		goto out_rs;
> +	}
> +
> +	sprintf(paux, "%s/%s/%s", dirname, subname, empty);
> +	mkdir(paux, 0600);
> +
> +	test_daemon();
> +	test_waitsig();
> +
> +	cgf = fopen("/proc/self/mountinfo", "r");
> +	if (cgf == NULL) {
> +		fail("No mountinfo file");
> +		goto out_rs;
> +	}
> +
> +	while (fgets(paux, sizeof(paux), cgf)) {
> +		char *s;
> +
> +		s = strstr(paux, cgname);
> +		if (s) {
> +			sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux);
> +			test_msg("found cgroup at %s\n", aux);
> +			sprintf(paux, "%s/%s/%s", aux, subname, empty);
> +			if (stat(paux, &st)) {
> +				fail("couldn't stat %s\n", paux);
> +				ret = -1;
> +				goto out_close;
> +			}
> +
> +			if (!S_ISDIR(st.st_mode)) {
> +				fail("%s is not a directory\n", paux);
> +				ret = -1;
> +				goto out_close;
> +			}
> +
> +			pass();
> +			ret = 0;
> +			goto out_close;
> +		}
> +	}
> +
> +	fail("empty cgroup not found!\n");
> +
> +out_close:
> +	fclose(cgf);
> +
> +	sprintf(paux, "%s/%s/%s", dirname, subname, empty);
> +	rmdir(paux);
> +out_rs:
> +	sprintf(paux, "%s/%s", dirname, subname);
> +	rmdir(paux);
> +	umount(dirname);
> +out_rd:
> +	rmdir(dirname);
> +out:
> +	return ret;
> +}
> diff --git a/util.c b/util.c
> index d697f7a..2553adc 100644
> --- a/util.c
> +++ b/util.c
> @@ -678,3 +678,80 @@ struct vma_area *alloc_vma_area(void)
>  
>  	return p;
>  }
> +
> +int mkdirp(const char *path)
> +{
> +	size_t i;
> +	char made_path[PATH_MAX], *pos;
> +
> +	if (strlen(path) >= PATH_MAX) {
> +		pr_err("path %s is longer than PATH_MAX", path);
> +		return -1;
> +	}
> +
> +	strcpy(made_path, path);
> +
> +	i = 0;
> +	if (made_path[0] == '/')
> +		i++;
> +
> +	for (; i < strlen(made_path); i++) {
> +		pos = strchr(made_path + i, '/');
> +		if (pos)
> +			*pos = '\0';
> +		if (mkdir(made_path, 0755) < 0 && errno != EEXIST) {
> +			pr_perror("couldn't mkdirpat directory\n");
> +			return -1;
> +		}
> +		if (pos) {
> +			*pos = '/';
> +			i = pos - made_path;
> +		} else
> +			break;
> +	}
> +
> +	return 0;
> +}
> +
> +bool is_path_prefix(const char *path, const char *prefix)
> +{
> +	if (strstartswith(path, prefix)) {
> +		size_t len = strlen(prefix);
> +		switch (path[len]) {
> +		case '\0':
> +		case '/':
> +			return true;
> +		}
> +	}
> +
> +	return false;
> +}
> +
> +FILE *fopenat(int dirfd, char *path, char *cflags)
> +{
> +	int tmp, flags = 0;
> +	char *iter;
> +
> +	for (iter = cflags; *iter; iter++) {
> +		switch (*iter) {
> +		case 'r':
> +			flags |= O_RDONLY;
> +			break;
> +		case 'a':
> +			flags |= O_APPEND;
> +			break;
> +		case 'w':
> +			flags |= O_WRONLY | O_CREAT;
> +			break;
> +		case '+':
> +			flags = O_RDWR | O_CREAT;
> +			break;
> +		}
> +	}
> +
> +	tmp = openat(dirfd, path, flags, S_IRUSR | S_IWUSR);
> +	if (tmp < 0)
> +		return NULL;
> +
> +	return fdopen(tmp, cflags);
> +}
> -- 
> 1.9.1
> 
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu


More information about the CRIU mailing list