[Devel] [PATCH 1/2 v1] cgroup: 'release_agent' property is now per-cgroup instead of per-mount.

Tue Mar 17 17:06:44 MSK 2020

On 17.03.2020 13:28, Valeriy Vdovin wrote:
> Until now, 'release_agent' was a single property for the whole hierarchy
> of cgroups in the same mounted subsystem. After mounting a subsystem
> this property was represented by a single 'release_agent' file located
> in the root cgroup's directoy. Container concept assumes that multiple
> virtual cgroup root's breaking single mount - single root relationship,
> single mount can have as many cgroup roots as there are containers's on
> the system. As an example of the end user of 'release_agent' file is
> systemd, that sets it's own binary by writing to this file at setup
> step. The use case may not be limited by systemd only, so other
> userspace scenarios might want to override 'release_agent' from their
> containers.
> 
> 'release_agent' field is moved from 'struct cgroupfs_root'
> which is a mount-wide cgroup options holder, to 'struct cgroup' itself.
> Former logic to only show 'release_agent' in root cgroup only is
> preserved but is extended to also support virtual roots by adding this
> file to cgroup directory, when it becomes marked as VE_ROOT at
> ve_start_container time.
> 
> Drawbacks:
> 1.
> Because release_agent is not only a cgroup property but also a cgroup
> mount option, userspace can set this two ways:
>  a) echo 'binary_path' > ..../cgroup/release_agent.
>  b) mount -t cgroup .... -o release_agent=binary_path'
> 
> First is totally ok with our solution, second is not. We are not
> changing the mount API for cgroups, so we leave the mount option as a
> way of setting release_agent, at mount time we read this mount option
> and put it into release_agent field of the top cgroup, which is the real
> root cgroup for this mount.
> 
> When remounting cgroup kernel code will try to compose mount options
> from the current mount and for release agent, it will take the value
> from the top cgroup.
> 
> When the user is host and he want's to know mount options by typing
> 'mount' in the command line for example, we read release_agent back from
> the top cgroup.
> 
> When the user is in container, we read release_agent from the VE_ROOT
> cgroup.
> 
> Here is a problem: although there is only one mount, the user can see
> different mount options for it. Bad thing will happen when the user will
> decide to remount the cgroup, he can compose '-o' options by using
> container release_agent and if it is different from the host's
> (top-cgroup), then remount will fail, it forbids remount with varying
> value of release_agent.
> 
> We ignore this problem as OK, because container processes are not allowed
> to remount cgroups.
> 
> 2.
> Host user reading '../cgroup/{VE-CGROUP}/release_agent' file for a particular
> container will get the binary path in container-local form, not the actual path
> on the host filesystem.
> This issue is minor and no valid reasoning found to make it the other way.
> 
> https://jira.sw.ru/browse/PSBM-83887
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>

The whole patch is big. Can we split it in some smaller peaces?
On the first sight I see at least changing cgroup_mark_ve_root()
type changing may be moved in a separate patch. Maybe something
else...

I'm not sure I understand full synchronization scheme you used,
please, see the comments below.

> ---
>  include/linux/cgroup.h |   9 ++--
>  kernel/cgroup.c        | 127 ++++++++++++++++++++++++++++++++++++++++++-------
>  kernel/ve/ve.c         |   8 +++-
>  3 files changed, 123 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index aa91e47..cad5b4f 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -285,6 +285,12 @@ struct cgroup {
>  	/* directory xattrs */
>  	struct simple_xattrs xattrs;
>  	u64 subgroups_limit;
> +
> +	/*
> +	 * Per-cgroup path to release agent binary for release
> +	 * notifications.
> +	 */
> +	const char *release_agent_path;
>  };
>  
>  #define MAX_CGROUP_ROOT_NAMELEN 64
> @@ -384,9 +390,6 @@ struct cgroupfs_root {
>  	/* IDs for cgroups in this hierarchy */
>  	struct ida cgroup_ida;
>  
> -	/* The path to use for release notifications. */
> -	char release_agent_path[PATH_MAX];
> -
>  	/* The name for this hierarchy - may be empty */
>  	char name[MAX_CGROUP_ROOT_NAMELEN];
>  };
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 9ca8af9..0b64d88 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1038,8 +1038,14 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
>  {
>  	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
>  	struct cgroup_subsys *ss;
> +	struct cgroup *root_cgrp = &root->top_cgroup;
> +
> +#ifdef CONFIG_VE
> +	struct ve_struct *ve = get_exec_env();
> +#endif
>  
>  	mutex_lock(&cgroup_root_mutex);
> +
>  	for_each_subsys(root, ss)
>  		seq_printf(seq, ",%s", ss->name);
>  	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
> @@ -1050,9 +1056,18 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
>  		seq_puts(seq, ",xattr");
>  	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
>  		seq_puts(seq, ",cpuset_v2_mode");
> -	if (strlen(root->release_agent_path))
> +#ifdef CONFIG_VE
> +	if (!ve_is_super(ve)) {
> +		mutex_lock(&cgroup_mutex);
> +		root_cgrp = task_cgroup_from_root(ve->init_task, root);
> +		mutex_unlock(&cgroup_mutex);
> +	}
> +#endif
> +	if (root_cgrp && root_cgrp->release_agent_path &&
> +		strlen(root_cgrp->release_agent_path)) {

Why not "root_cgrp->release_agent_path[0] != 0"?

>  		seq_show_option(seq, "release_agent",
> -				root->release_agent_path);
> +				root_cgrp->release_agent_path);
> +	}
>  	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
>  		seq_puts(seq, ",clone_children");
>  	if (strlen(root->name))
> @@ -1275,6 +1290,26 @@ static void drop_parsed_module_refcounts(unsigned long subsys_mask)
>  	}
>  }
>  
> +static int cgroup_set_release_agent_locked(struct cgroup *cgrp,
> +					   const char *release_agent)
> +{
> +	size_t len;
> +	char *path;
> +
> +	len = strlen(release_agent);
> +	if (len >= PATH_MAX)
> +		return -EINVAL;
> +
> +	path = kstrdup(release_agent, GFP_KERNEL);
> +	if (!path)
> +		return -ENOMEM;
> +
> +	kfree(cgrp->release_agent_path);
> +
> +	cgrp->release_agent_path = path;

Which mutex does protect cgrp->release_agent_path?

I see cgroup_mutex taken from here:

cgroup_release_agent_write()->cgroup_set_release_agent_locked()

And cgroup_root_mutex taken from cgroup_show_options().

> +	return 0;
> +}
> +
>  static int cgroup_remount(struct super_block *sb, int *flags, char *data)
>  {
>  	int ret = 0;
> @@ -1331,7 +1366,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
>  	cgroup_populate_dir(cgrp, false, added_mask);
>  
>  	if (opts.release_agent)
> -		strcpy(root->release_agent_path, opts.release_agent);
> +		ret = cgroup_set_release_agent_locked(cgrp, opts.release_agent);
>   out_unlock:
>  	kfree(opts.release_agent);
>  	kfree(opts.name);
> @@ -1493,7 +1528,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
>  	root->flags = opts->flags;
>  	ida_init(&root->cgroup_ida);
>  	if (opts->release_agent)
> -		strcpy(root->release_agent_path, opts->release_agent);
> +		cgroup_set_release_agent_locked(&root->top_cgroup,
> +			opts->release_agent);
>  	if (opts->name)
>  		strcpy(root->name, opts->name);
>  	if (opts->cpuset_clone_children)
> @@ -1867,6 +1903,30 @@ int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen)
>  	return __cgroup_path(cgrp, buf, buflen, !ve_is_super(ve) && !ve->is_pseudosuper);
>  }
>  
> +static inline struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
> +{
> +	/*
> +	 * Find nearest root cgroup, which might be host cgroup root
> +	 * or ve cgroup root.
> +	 *
> +	 *    <host_root_cgroup> -> local_root
> +	 *     \                    ^
> +	 *      <cgroup>            |
> +	 *       \                  |
> +	 *        <cgroup>   --->   from here
> +	 *        \
> +	 *         <ve_root_cgroup> -> local_root
> +	 *         \                   ^
> +	 *          <cgroup>           |
> +	 *          \                  |
> +	 *           <cgroup>  --->    from here
> +	 */
> +	while (cgrp->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags))
> +		cgrp = cgrp->parent;
> +
> +	return cgrp;
> +}
> +
>  /*
>   * Control Group taskset
>   */
> @@ -2258,19 +2318,16 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
>  static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
>  				      const char *buffer)
>  {
> -	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
> -
> +	int ret;
>  	if (strlen(buffer) >= PATH_MAX)
>  		return -EINVAL;
>  
>  	if (!cgroup_lock_live_group(cgrp))
>  		return -ENODEV;
>  
> -	mutex_lock(&cgroup_root_mutex);
> -	strcpy(cgrp->root->release_agent_path, buffer);
> -	mutex_unlock(&cgroup_root_mutex);
> +	ret = cgroup_set_release_agent_locked(cgrp, buffer);
>  	mutex_unlock(&cgroup_mutex);
> -	return 0;
> +	return ret;
>  }
>  
>  static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
> @@ -2278,7 +2335,8 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
>  {
>  	if (!cgroup_lock_live_group(cgrp))
>  		return -ENODEV;
> -	seq_puts(seq, cgrp->root->release_agent_path);
> +	if (cgrp->release_agent_path)
> +		seq_puts(seq, cgrp->release_agent_path);
>  	seq_putc(seq, '\n');
>  	mutex_unlock(&cgroup_mutex);
>  	return 0;
> @@ -4094,7 +4152,7 @@ static struct cftype files[] = {
>  	},
>  	{
>  		.name = "release_agent",
> -		.flags = CFTYPE_ONLY_ON_ROOT,
> +		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
>  		.read_seq_string = cgroup_release_agent_show,
>  		.write_string = cgroup_release_agent_write,
>  		.max_write_len = PATH_MAX,
> @@ -4223,21 +4281,52 @@ static int subgroups_count(struct cgroup *cgroup)
>  	return cgrps_count;
>  }
>  
> +static struct cftype *get_cftype_by_name(const char *name)
> +{
> +	struct cftype *cft;
> +	for (cft = files; cft->name[0] != '\0'; cft++) {
> +		if (!strcmp(cft->name, name))
> +			return cft;
> +	}
> +	return NULL;
> +}
> +
> +static int cgroup_add_file_on_mark_ve(struct cgroup *ve_root)
> +{
> +	int err = 0;
> +	struct dentry *dir = ve_root->dentry;
> +	struct cftype *cft = get_cftype_by_name("release_agent");
> +	BUG_ON(!cft);
> +
> +	mutex_lock(&dir->d_inode->i_mutex);
> +	err = cgroup_add_file(ve_root, NULL, cft);
> +	mutex_unlock(&dir->d_inode->i_mutex);
> +	if (err) {
> +		pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
> +			cft->name, err);
> +	}
> +	return err;
> +}
> +
>  #ifdef CONFIG_VE
> -void cgroup_mark_ve_root(struct ve_struct *ve)
> +int cgroup_mark_ve_root(struct ve_struct *ve)
>  {
>  	struct cgroup *cgrp;
>  	struct cgroupfs_root *root;
> +	int err = 0;
>  
>  	mutex_lock(&cgroup_mutex);
>  	for_each_active_root(root) {
>  		cgrp = task_cgroup_from_root(ve->init_task, root);
>  		set_bit(CGRP_VE_ROOT, &cgrp->flags);
> -
> +		err = cgroup_add_file_on_mark_ve(cgrp);
> +		if (err)
> +			break;
>  		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
>  			link_ve_root_cpu_cgroup(cgrp);
>  	}
>  	mutex_unlock(&cgroup_mutex);
> +	return err;
>  }
>  
>  struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
> @@ -4556,6 +4645,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
>  	}
>  	spin_unlock(&cgrp->event_list_lock);
>  
> +	kfree(cgrp->release_agent_path);
> +
>  	return 0;
>  };
>  
> @@ -5366,6 +5457,7 @@ static void cgroup_release_agent(struct work_struct *work)
>  		char *argv[3], *envp[3];
>  		int i, err;
>  		char *pathbuf = NULL, *agentbuf = NULL;
> +		struct cgroup *root_cgrp;
>  		struct cgroup *cgrp = list_entry(release_list.next,
>  						    struct cgroup,
>  						    release_list);
> @@ -5374,9 +5466,12 @@ static void cgroup_release_agent(struct work_struct *work)
>  		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
>  		if (!pathbuf)
>  			goto continue_free;
> -		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
> +		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0)
>  			goto continue_free;
> -		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
> +		root_cgrp = cgroup_get_local_root(cgrp);
> +		if (root_cgrp->release_agent_path)
> +			agentbuf = kstrdup(root_cgrp->release_agent_path,
> +				GFP_KERNEL);
>  		if (!agentbuf)
>  			goto continue_free;
>  
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index ad3a698..a64b4a7 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -479,7 +479,7 @@ static void ve_drop_context(struct ve_struct *ve)
>  
>  static const struct timespec zero_time = { };
>  
> -extern void cgroup_mark_ve_root(struct ve_struct *ve);
> +extern int cgroup_mark_ve_root(struct ve_struct *ve);
>  
>  /* under ve->op_sem write-lock */
>  static int ve_start_container(struct ve_struct *ve)
> @@ -528,7 +528,9 @@ static int ve_start_container(struct ve_struct *ve)
>  	if (err < 0)
>  		goto err_iterate;
>  
> -	cgroup_mark_ve_root(ve);
> +	err = cgroup_mark_ve_root(ve);
> +	if (err)
> +		goto err_mark_ve;
>  
>  	ve->is_running = 1;
>  
> @@ -538,6 +540,8 @@ static int ve_start_container(struct ve_struct *ve)
>  
>  	return 0;
>  
> +err_mark_ve:
> +	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>  err_iterate:
>  	ve_stop_umh(ve);
>  err_umh:
>