[CRIU] [PATCH] Attempt to restore cgroups

Tue Jul 1 13:12:10 PDT 2014

On 07/01/2014 05:36 PM, Tycho Andersen wrote:
> During the dump phase, /proc/cgroups is parsed to find co-mounted cgroups.
> Then, for each task /proc/self/cgroup is parsed for the cgroups that it is a
> member of, and that cgroup is traversed to find any child cgroups which may
> also need restoring. All of this information is persisted along with the
> original cg_sets, which indicate which cgroups a task is a member of.
> 
> On restore, an initial phase creates all the cgroups which were saved and
> attempts to restore any peroperties they had. Then the tasks are restored into
> their respective cgroups via cg_sets as usual.

Thanks for doing this :) Some my thoughts are inline.

> Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
> ---
>  cgroup.c              | 555 +++++++++++++++++++++++++++++++++++++++++++++++---
>  cr-dump.c             |   6 +
>  include/cgroup.h      |  43 ++++
>  include/proc_parse.h  |   2 +
>  include/util.h        |   5 +
>  proc_parse.c          |   1 +
>  protobuf/cgroup.proto |  26 ++-
>  util.c                |  26 +++
>  8 files changed, 629 insertions(+), 35 deletions(-)
> 
> diff --git a/cgroup.c b/cgroup.c
> index 9bf9e45..c980456 100644
> --- a/cgroup.c
> +++ b/cgroup.c
> @@ -28,6 +30,12 @@ struct cg_set {
>  	struct list_head	ctls;
>  };
>  

Please, write a short comment about each new structure
you introduce.

> +struct cgroup_mount {
> +	struct list_head	l;
> +	char			*controller;
> +	char			*path;
> +};
> +
>  static LIST_HEAD(cg_sets);
>  static unsigned int n_sets;
>  static CgSetEntry **rst_sets;
> @@ -118,6 +130,357 @@ static struct cg_set *get_cg_set(struct list_head *ctls, unsigned int n_ctls)

> +struct cgroup_mount *find_cg_mount(const char *controller)
> +{
> +	struct cgroup_mount *cur;
> +	list_for_each_entry(cur, &cgroup_mounts, l) {
> +		if (strcmp(controller, cur->controller) == 0)
> +			return cur;
> +	}
> +
> +	return NULL;
> +}
> +
> +int parse_cg_mountinfo()

The proc parsing routines are better to be placed in the proc_parse.c file.

> +{
> +	FILE *f = fopen("/proc/mounts", "r");

We have the /proc/pid/mountinfo parsing routine ready. Can it be re-used
for this purpose?

> +	char buf[1024];
> +
> +	while(fgets(buf, 1024, f)) {
> +		char path[PATH_MAX], fs[1024], opts[1024];
> +		struct cgroup_mount *cgm;
> +
> +		if (sscanf(buf, "%*s %s %s %s %*d %*d", path, fs, opts) != 3)
> +			return 0;
> +
> +		if (strcmp(fs, "cgroup") == 0) {
> +			char *slash;
> +
> +			slash = strrchr(path, '/');
> +			if (!slash) {
> +				pr_err("bad path %s\n", path);
> +				return -1;
> +			}
> +
> +			cgm = xmalloc(sizeof(*cgm));
> +			if (!cgm)
> +				return -1;
> +
> +
> +			*slash = '\0';
> +			cgm->path = xstrdup(path);
> +			if (!cgm->path) {
> +				xfree(cgm);
> +				return -1;
> +			}
> +
> +			cgm->controller = xstrdup(slash + 1);
> +			if (!cgm->controller) {
> +				xfree(cgm->path);
> +				xfree(cgm);
> +				return -1;
> +			}
> +
> +			list_add_tail(&cgm->l, &cgroup_mounts);
> +		}
> +	}

The f is left opened after this routine. We try not to initialize
variables during declaration with function calls, only constants.
This helps a lot in spotting such things early.

> +
> +	return 0;
> +}
> +
> +/* Parse and create all the real controllers. This does not include things with
> + * the "name=" prefix, e.g. systemd.
> + */
> +int parse_cgroups()
> +{
> +	FILE *f = fopen("/proc/cgroups", "r");
> +	char buf[1024], name[1024];
> +	uint32_t heirarchy;

The hierarchy thing doesn't go directly into images,
to it can be just int. This would make the sscanf
below look nicer.

> +	struct cg_controller *cur = NULL;
> +
> +	if (!f) {
> +		pr_perror("failed opening /proc/cgroups");
> +		return -1;
> +	}
> +
> +	/* throw away the header */
> +	if (!fgets(buf, 1024, f))
> +		return 0;
> +
> +	while(fgets(buf, 1024, f))
> +	{
> +		char *n;
> +		char found = 0;
> +
> +		sscanf(buf, "%s" "%" SCNu32, name, &heirarchy);
> +		list_for_each_entry(cur, &cgroups, l) {
> +			if (cur->heirarchy == heirarchy)
> +			{
> +				void *m;
> +
> +				found = 1;
> +				cur->n_controllers++;
> +				m = xrealloc(cur->controllers, sizeof(char*) * cur->n_controllers);
> +				if(!m)
> +					return -1;
> +
> +				cur->controllers = m;
> +				if (!cur->controllers)
> +					return -1;
> +
> +				n = xstrdup(name);
> +				if (!n)
> +					return -1;
> +
> +				cur->controllers[cur->n_controllers-1] = n;
> +				break;
> +			}
> +		}
> +
> +		if (!found)
> +		{
> +			struct cg_controller *nc = new_controller(name, heirarchy);
> +			if (!nc)
> +				return -1;
> +			list_add_tail(&nc->l, &cur->l);
> +			n_cgroups++;
> +		}
> +	}

Same here -- f is not fclose()-d.

> +
> +	return 0;
> +}
> +
> +static struct cg_controller	*current_controller;
> +static char			*current_controller_name;
> +
> +#define EXACT_MATCH	0
> +#define PARENT_MATCH	1
> +#define NO_MATCH	2
> +
> +static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir **rdir)
> +{
> +	struct cgroup_dir *d;
> +	list_for_each_entry(d, dirs, siblings) {
> +		pr_info("%s %s\n", d->path, path);
> +		if (strcmp(d->path, path) == 0) {
> +			*rdir = d;
> +			return EXACT_MATCH;
> +		}
> +
> +		if (strncmp(d->path, path, strlen(d->path))) {

Can strstartswith() help here?

> +			int ret = find_dir(path, &d->children, rdir);
> +			if (ret == NO_MATCH) {
> +				*rdir = d;
> +				return PARENT_MATCH;
> +			}
> +			return ret;
> +
> +		}
> +	}
> +
> +	return NO_MATCH;
> +}
> +
> +static int add_cgroup(const char *fpath, const struct stat *sb, int typeflag)
> +{
> +	struct cgroup_dir *ncd = NULL, *match;
> +	int ret = 0;
> +	char pbuf[PATH_MAX], *name, *path;
> +
> +
> +	if (typeflag == FTW_D)
> +	{
> +		FILE *f;
> +		int mtype;
> +
> +		strncpy(pbuf, fpath, PATH_MAX);
> +
> +		pr_info("adding cgroup %s\n", fpath);
> +
> +		ncd = xmalloc(sizeof(*ncd));
> +		if(!ncd)
> +		{
> +			ret = -1;
> +			goto out;
> +		}
> +		ncd->path = NULL;
> +		pr_info("foo2\n");

Nice log message :)

> +
> +		name = strstr(fpath, current_controller_name);
> +		if (!name) {
> +			pr_err("%s has no name component (%s)\n", fpath, current_controller_name);
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		path = strchr(name, '/');
> +		if (!path) {
> +			pr_err("Bad path %s\n", name);
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		ncd->path = xstrdup(path);
> +		if (!ncd->path) {
> +			ret = -1;
> +			goto out;
> +		}
> +
> +		mtype = find_dir(path, &current_controller->heads, &match);
> +
> +		switch(mtype) {
> +			/* ignore co-mounted cgroups */
> +			case EXACT_MATCH :
> +				goto out;
> +			case PARENT_MATCH :
> +				list_add_tail(&ncd->siblings, &match->children);
> +				match->n_children++;
> +				break;
> +			case NO_MATCH :
> +				list_add_tail(&ncd->siblings, &current_controller->heads);
> +				current_controller->n_heads++;
> +				break;

If we have two directories -- /foo and /foo/bar and find the latter one first,
then both the /foo and the /foo/bar would just be added into the controller->heads
list, but only the /foo should, while /foo/bar should be in /foo's ->children.

> +		}
> +
> +		INIT_LIST_HEAD(&ncd->children);
> +		ncd->n_children = 0;
> +		ncd->controller = current_controller;
> +
> +		ncd->flags = 0;
> +
> +		snprintf(pbuf, PATH_MAX, "%s/memory.limit_in_bytes", fpath);
> +		f = fopen(pbuf, "r");
> +		if (f)
> +		{
> +			if (fscanf(f, "%" SCNu64, &ncd->mem_limit) != 1)
> +			{
> +				pr_err("Failed scanning %s\n", pbuf);
> +				ret = -1;
> +				goto out;
> +			}
> +			ncd->flags |= HAS_MEM_LIMIT;
> +			fclose(f);
> +		}
> +
> +		snprintf(pbuf, PATH_MAX, "%s/cpu.shares", fpath);
> +		f = fopen(pbuf, "r");
> +		if (f)
> +		{
> +			if (fscanf(f, "%" SCNu32, &ncd->cpu_shares) != 1)
> +			{
> +				pr_err("Failed scanning %s for u32\n", pbuf);
> +				ret = -1;
> +				goto out;
> +			}
> +			ncd->flags |= HAS_CPU_SHARES;
> +			fclose(f);
> +		}
> +
> +		return 0;
> +	}
> +
> +out:
> +	if (ncd) {
> +		if (ncd->path)
> +			xfree(ncd->path);
> +		xfree(ncd);
> +	}
> +
> +	return ret;
> +}
> +

> @@ -134,6 +497,9 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id)
>  	if (parse_task_cgroup(pid, &ctls, &n_ctls))
>  		return -1;
>  
> +	if (collect_cgroups(&ctls) < 0)

AFAIU you call the collect_cgroup() for every new cgset met in order to construct
the tree of directories potentially seen by the processes we dump. But since the
dump_sets() verify, that all such cgroups are subsets of the init task's one, would
it be easier just to scan the tree down starting from init task's dirs?

> +		return -1;
> +
>  	cs = get_cg_set(&ctls, n_ctls);
>  	if (!cs)
>  		return -1;
> @@ -152,6 +518,74 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id)
>  	return 0;
>  }
>  
> +int dump_cg_dirs(struct list_head *dirs, size_t n_dirs, CgroupDirEntry ***ents)

This one can be marked static.

> +{
> +	struct cgroup_dir *cur;
> +	CgroupDirEntry *cde;
> +	void *m;
> +	int i = 0;
> +
> +	m = xmalloc(n_dirs * (sizeof(CgroupDirEntry *) + sizeof(CgroupDirEntry)));
> +	*ents = m;
> +	if (!m)
> +		return -1;
> +
> +	cde = m + n_dirs * sizeof(CgroupDirEntry *);

Just for the record -- we have the xptr_pull() helper that can help putting
objects one-by-one in the single-malloc-ed memory segment. You can find the
good (I hope)  example of it in the core_entry_alloc().

> +
> +	list_for_each_entry(cur, dirs, siblings) {
> +		cgroup_dir_entry__init(cde);
> +
> +		cde->path = cur->path;
> +		cde->has_mem_limit = cur->flags & HAS_MEM_LIMIT;
> +		cde->mem_limit = cur->mem_limit;
> +		cde->has_cpu_shares = cur->flags & HAS_CPU_SHARES;
> +		cde->cpu_shares = cur->cpu_shares;
> +
> +		cde->n_children = cur->n_children;
> +		if (cur->n_children > 0)
> +			if (dump_cg_dirs(&cur->children, cur->n_children, &cde->children) < 0) {
> +				xfree(*ents);
> +				return -1;
> +			}
> +		(*ents)[i++] = cde++;
> +	}
> +
> +	return 0;
> +}
> +

> @@ -323,6 +759,58 @@ void fini_cgroup(void)
>  	xfree(cg_yard);
>  }
>  
> +static int prepare_cgroup_dirs(char* paux, size_t off, CgroupDirEntry **ents, size_t n_ents)
> +{
> +	size_t i, my_off;
> +	CgroupDirEntry *e;
> +
> +	for (i = 0; i < n_ents; i++)
> +	{
> +		e = ents[i];
> +
> +		my_off = sprintf(paux + off, "/%s", e->path);
> +
> +		if (mkdirp(paux)) {
> +			pr_perror("Can't make cgroup dir %s", paux);
> +			return -1;
> +		}
> +
> +		if (e->has_mem_limit) {
> +			FILE *f;
> +
> +			sprintf(paux + my_off + off, "/memory.limit_in_bytes");
> +
> +			f = fopen(paux, "w+");
> +			if (!f) {
> +				pr_perror("Couldn't open %s for writing\n", paux);
> +				return -1;
> +			}
> +
> +			fprintf(f, "%" SCNu64, e->mem_limit);
> +			fclose(f);

Some minor thing, that can be done later, but still.

In OpenVZ we restore all the limits only after all tasks are restored
and pushed into their cgroups. This is done for two reasons.

First, some controllers allow situations when the usage is higher than
the limit. E.g. the kmemcg will be such, this can happen when tasks eat
memory, and the admin lowers the limit. Kmem is mostly unshrinkable and
such cgroup will (well, may) just live and fail all new allocations.
Restoration inside pre-limited cgroup will be impossible.

And the 2nd reason is -- if we put too strict e.g. cpu limit this may
slow down the restore precess significantly. It's much better to restore
withing some larger limits and then tie them.

> +		}
> +
> +		if (e->has_cpu_shares) {
> +			FILE *f;
> +
> +			sprintf(paux + my_off + off, "/cpu.shares");
> +
> +			f = fopen(paux, "w+");
> +			if (!f) {
> +				pr_perror("Couldn't open %s for writing\n", paux);
> +				return -1;
> +			}
> +
> +			fprintf(f, "%" SCNu32, e->cpu_shares);
> +			fclose(f);
> +		}
> +
> +		prepare_cgroup_dirs(paux, off, e->children, e->n_children);
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * Prepare the CGROUP_YARD service descriptor. This guy is
>   * tmpfs mount with the set of ctl->name directories each

> diff --git a/include/proc_parse.h b/include/proc_parse.h
> index b153328..c41d664 100644
> --- a/include/proc_parse.h
> +++ b/include/proc_parse.h
> @@ -194,6 +195,7 @@ struct cg_ctl {
>  	struct list_head l;
>  	char *name;
>  	char *path;
> +	struct cgroup *cgroup;

This one is unused in the patch.

>  };
>  
>  /*

> index a497c70..19804a3 100644
> --- a/protobuf/cgroup.proto
> +++ b/protobuf/cgroup.proto
> @@ -1,13 +1,27 @@
> -message controller_entry {
> -	required string name	= 1;
> -	required string path	= 2;
> +message cgroup_dir_entry {
> +	required string 		path		= 1;
> +	optional uint64 		mem_limit 	= 2;
> +	optional uint32 		cpu_shares	= 3;
> +	repeated cgroup_dir_entry	children 	= 4;
> +}
> +
> +message cg_controller_entry {
> +	required uint32			id		= 1;
> +	repeated string			controllers	= 2;
> +	repeated cgroup_dir_entry	dirs		= 3;
> +}
> +
> +message cg_member_entry {
> +	required string		name		= 1;
> +	required string		path            = 2;
>  }
>  
>  message cg_set_entry {
> -	required uint32			id	= 1;
> -	repeated controller_entry	ctls	= 2;
> +	required uint32                 id      = 1;
> +	repeated cg_member_entry	ctls    = 3;

I don't mind renaming the controller_entry into cg_member_entry, but
can you do this as a separate patch?

>  }
>  
>  message cgroup_entry {
> -	repeated cg_set_entry	sets	= 1;
> +	repeated cg_set_entry		sets		= 1;
> +	repeated cg_controller_entry	controllers	= 2;
>  }