[Devel] [PATCH 1/2 v0] cgroup: 'release_agent' property is now per-cgroup instead of per-mount.
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Fri Mar 13 16:16:21 MSK 2020
On 3/13/20 1:09 PM, Valeriy Vdovin wrote:
> Until now 'release_agent' was a single property for the whole
> hierarchy of cgroups within one mounted subsystem. After mounting a
> subsystem this property was represented by a single 'release_agent'
> file located in the root cgroup's directory. The container concept
> introduces multiple virtual cgroup roots, breaking the single mount -
> single root relationship: a single mount can have as many cgroup
> roots as there are containers on the system. A typical end user of
> the 'release_agent' file is systemd, which sets its own binary by
> writing to this file during setup. The use case is not limited to
> systemd; other userspace scenarios might also want to override
> 'release_agent' from inside their containers.
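Side note for context: the release-notification path that this property
controls boils down to roughly the following. When a notify_on_release
cgroup becomes empty, the kernel execs the configured agent binary with
the path of the emptied cgroup as its argument. This is only a
simplified sketch of cgroup_release_agent() further down in this patch
(the helper name is made up, and locking, list handling and VE path
handling are omitted), in the context of kernel/cgroup.c:

static void notify_release_agent(struct cgroup *cgrp)
{
	char *argv[3], *envp[3];
	char *pathbuf;

	pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!pathbuf)
		return;
	if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
		goto out;

	/* binary configured via release_agent (per-cgroup after this patch) */
	argv[0] = (char *)cgrp->release_agent_path;
	argv[1] = pathbuf;	/* path of the cgroup that became empty */
	argv[2] = NULL;

	envp[0] = "HOME=/";
	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
	envp[2] = NULL;

	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
out:
	kfree(pathbuf);
}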
>
> The 'release_agent' field is moved from 'struct cgroupfs_root', which
> holds mount-wide cgroup options, to 'struct cgroup' itself. The former
> logic of showing 'release_agent' only in the root cgroup is preserved,
> but extended to also support virtual roots: the file is added to a
> cgroup's directory when that cgroup becomes marked as VE_ROOT at
> ve_start_container time.
>
> Drawbacks:
> 1.
> Because release_agent is not only a cgroup property but also a cgroup
> mount option, userspace can set it in two ways:
> a) echo 'binary_path' > ..../cgroup/release_agent
> b) mount -t cgroup .... -o release_agent=binary_path
>
> The first is fully compatible with our solution, the second is not.
> We are not changing the mount API for cgroups, so we keep the mount
> option as a way of setting release_agent: at mount time we read this
> mount option and put it into the release_agent field of the top
> cgroup, which is the real root cgroup for this mount.
>
> When remounting a cgroup, the kernel code will try to compose the
> mount options from the current mount, and for release_agent it will
> take the value from the top cgroup.
>
> When the user is on the host and wants to see the mount options, for
> example by typing 'mount' on the command line, we read release_agent
> back from the top cgroup.
>
> When the user is in a container, we read release_agent from the
> VE_ROOT cgroup.
>
> Here is the problem: although there is only one mount, users can see
> different mount options for it. Things go wrong when a user decides
> to remount the cgroup: they may compose the '-o' options using the
> container's release_agent, and if it differs from the host's
> (top-cgroup) value, the remount will fail, because remounting with a
> different release_agent value is forbidden.
>
> We consider this acceptable, because container processes are not
> allowed to remount cgroups.
>
> 2.
> A host user reading the '../cgroup/{VE-CGROUP}/release_agent' file of
> a particular container will get the binary path in container-local
> form, not the actual path on the host filesystem.
> This issue is minor, and no compelling reason was found to do it the
> other way.
>
> https://jira.sw.ru/browse/PSBM-83887
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
> include/linux/cgroup.h |   9 ++--
> kernel/cgroup.c        | 127 ++++++++++++++++++++++++++++++++++++++++++-------
> kernel/ve/ve.c         |   8 +++-
> 3 files changed, 123 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index aa91e47..cad5b4f 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -285,6 +285,12 @@ struct cgroup {
> /* directory xattrs */
> struct simple_xattrs xattrs;
> u64 subgroups_limit;
> +
> + /*
> + * Per-cgroup path to release agent binary for release
> + * notifications.
> + */
> + const char *release_agent_path;
> };
>
> #define MAX_CGROUP_ROOT_NAMELEN 64
> @@ -384,9 +390,6 @@ struct cgroupfs_root {
> /* IDs for cgroups in this hierarchy */
> struct ida cgroup_ida;
>
> - /* The path to use for release notifications. */
> - char release_agent_path[PATH_MAX];
> -
> /* The name for this hierarchy - may be empty */
> char name[MAX_CGROUP_ROOT_NAMELEN];
> };
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 9ca8af9..0b64d88 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1038,8 +1038,14 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
> {
> struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
> struct cgroup_subsys *ss;
> + struct cgroup *root_cgrp = &root->top_cgroup;
> +
> +#ifdef CONFIG_VE
> + struct ve_struct *ve = get_exec_env();
> +#endif
>
> mutex_lock(&cgroup_root_mutex);
> +
> for_each_subsys(root, ss)
> seq_printf(seq, ",%s", ss->name);
> if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
> @@ -1050,9 +1056,18 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
> seq_puts(seq, ",xattr");
> if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
> seq_puts(seq, ",cpuset_v2_mode");
> - if (strlen(root->release_agent_path))
> +#ifdef CONFIG_VE
> + if (!ve_is_super(ve)) {
> + mutex_lock(&cgroup_mutex);
> + root_cgrp = task_cgroup_from_root(ve->init_task, root);
This would likely be broken if in a CT you do:
mkdir /sys/fs/cgroup/systemd/temporary
echo 1 > /sys/fs/cgroup/systemd/temporary/tasks
Your root_cgrp will then point to "temporary". Maybe you also need to
use cgroup_get_local_root() here?
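Something like this on top of your patch would do it, I think. Just a
sketch, reusing the cgroup_get_local_root() helper you add later in
cgroup.c, so it would need to be declared or moved above
cgroup_show_options():

#ifdef CONFIG_VE
	if (!ve_is_super(ve)) {
		mutex_lock(&cgroup_mutex);
		root_cgrp = task_cgroup_from_root(ve->init_task, root);
		/* walk up to the nearest CGRP_VE_ROOT (or the real root) */
		root_cgrp = cgroup_get_local_root(root_cgrp);
		mutex_unlock(&cgroup_mutex);
	}
#endif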
> + mutex_unlock(&cgroup_mutex);
> + }
> +#endif
> + if (root_cgrp && root_cgrp->release_agent_path &&
> + strlen(root_cgrp->release_agent_path)) {
> seq_show_option(seq, "release_agent",
> - root->release_agent_path);
> + root_cgrp->release_agent_path);
> + }
> if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
> seq_puts(seq, ",clone_children");
> if (strlen(root->name))
> @@ -1275,6 +1290,26 @@ static void drop_parsed_module_refcounts(unsigned long subsys_mask)
> }
> }
>
> +static int cgroup_set_release_agent_locked(struct cgroup *cgrp,
> + const char *release_agent)
> +{
> + size_t len;
> + char *path;
> +
> + len = strlen(release_agent);
> + if (len >= PATH_MAX)
> + return -EINVAL;
> +
> + path = kstrdup(release_agent, GFP_KERNEL);
> + if (!path)
> + return -ENOMEM;
> +
> + kfree(cgrp->release_agent_path);
> +
> + cgrp->release_agent_path = path;
> + return 0;
> +}
> +
> static int cgroup_remount(struct super_block *sb, int *flags, char *data)
> {
> int ret = 0;
> @@ -1331,7 +1366,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
> cgroup_populate_dir(cgrp, false, added_mask);
>
> if (opts.release_agent)
> - strcpy(root->release_agent_path, opts.release_agent);
> + ret = cgroup_set_release_agent_locked(cgrp, opts.release_agent);
> out_unlock:
> kfree(opts.release_agent);
> kfree(opts.name);
> @@ -1493,7 +1528,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
> root->flags = opts->flags;
> ida_init(&root->cgroup_ida);
> if (opts->release_agent)
> - strcpy(root->release_agent_path, opts->release_agent);
> + cgroup_set_release_agent_locked(&root->top_cgroup,
> + opts->release_agent);
> if (opts->name)
> strcpy(root->name, opts->name);
> if (opts->cpuset_clone_children)
> @@ -1867,6 +1903,30 @@ int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen)
> return __cgroup_path(cgrp, buf, buflen, !ve_is_super(ve) && !ve->is_pseudosuper);
> }
>
> +static inline struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
> +{
> + /*
> + * Find nearest root cgroup, which might be host cgroup root
> + * or ve cgroup root.
> + *
> + * <host_root_cgroup> -> local_root
> + * \ ^
> + * <cgroup> |
> + * \ |
> + * <cgroup> ---> from here
> + * \
> + * <ve_root_cgroup> -> local_root
> + * \ ^
> + * <cgroup> |
> + * \ |
> + * <cgroup> ---> from here
> + */
> + while (cgrp->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags))
> + cgrp = cgrp->parent;
> +
> + return cgrp;
> +}
> +
> /*
> * Control Group taskset
> */
> @@ -2258,19 +2318,16 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
> static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
> const char *buffer)
> {
> - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
> -
> + int ret;
> if (strlen(buffer) >= PATH_MAX)
> return -EINVAL;
>
> if (!cgroup_lock_live_group(cgrp))
> return -ENODEV;
>
> - mutex_lock(&cgroup_root_mutex);
> - strcpy(cgrp->root->release_agent_path, buffer);
> - mutex_unlock(&cgroup_root_mutex);
> + ret = cgroup_set_release_agent_locked(cgrp, buffer);
> mutex_unlock(&cgroup_mutex);
> - return 0;
> + return ret;
> }
>
> static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
> @@ -2278,7 +2335,8 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
> {
> if (!cgroup_lock_live_group(cgrp))
> return -ENODEV;
> - seq_puts(seq, cgrp->root->release_agent_path);
> + if (cgrp->release_agent_path)
> + seq_puts(seq, cgrp->release_agent_path);
> seq_putc(seq, '\n');
> mutex_unlock(&cgroup_mutex);
> return 0;
> @@ -4094,7 +4152,7 @@ static struct cftype files[] = {
> },
> {
> .name = "release_agent",
> - .flags = CFTYPE_ONLY_ON_ROOT,
> + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
> .read_seq_string = cgroup_release_agent_show,
> .write_string = cgroup_release_agent_write,
> .max_write_len = PATH_MAX,
> @@ -4223,21 +4281,52 @@ static int subgroups_count(struct cgroup *cgroup)
> return cgrps_count;
> }
>
> +static struct cftype *get_cftype_by_name(const char *name)
> +{
> + struct cftype *cft;
> + for (cft = files; cft->name[0] != '\0'; cft++) {
> + if (!strcmp(cft->name, name))
> + return cft;
> + }
> + return NULL;
> +}
> +
> +static int cgroup_add_file_on_mark_ve(struct cgroup *ve_root)
> +{
> + int err = 0;
> + struct dentry *dir = ve_root->dentry;
> + struct cftype *cft = get_cftype_by_name("release_agent");
> + BUG_ON(!cft);
> +
> + mutex_lock(&dir->d_inode->i_mutex);
> + err = cgroup_add_file(ve_root, NULL, cft);
> + mutex_unlock(&dir->d_inode->i_mutex);
> + if (err) {
> + pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
> + cft->name, err);
> + }
> + return err;
> +}
> +
> #ifdef CONFIG_VE
> -void cgroup_mark_ve_root(struct ve_struct *ve)
> +int cgroup_mark_ve_root(struct ve_struct *ve)
> {
> struct cgroup *cgrp;
> struct cgroupfs_root *root;
> + int err = 0;
>
> mutex_lock(&cgroup_mutex);
> for_each_active_root(root) {
> cgrp = task_cgroup_from_root(ve->init_task, root);
> set_bit(CGRP_VE_ROOT, &cgrp->flags);
> -
> + err = cgroup_add_file_on_mark_ve(cgrp);
> + if (err)
> + break;
> if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
> link_ve_root_cpu_cgroup(cgrp);
> }
> mutex_unlock(&cgroup_mutex);
> + return err;
> }
>
> struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
> @@ -4556,6 +4645,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
> }
> spin_unlock(&cgrp->event_list_lock);
>
> + kfree(cgrp->release_agent_path);
> +
> return 0;
> };
>
> @@ -5366,6 +5457,7 @@ static void cgroup_release_agent(struct work_struct *work)
> char *argv[3], *envp[3];
> int i, err;
> char *pathbuf = NULL, *agentbuf = NULL;
> + struct cgroup *root_cgrp;
> struct cgroup *cgrp = list_entry(release_list.next,
> struct cgroup,
> release_list);
> @@ -5374,9 +5466,12 @@ static void cgroup_release_agent(struct work_struct *work)
> pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
> if (!pathbuf)
> goto continue_free;
> - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
> + if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0)
> goto continue_free;
> - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
> + root_cgrp = cgroup_get_local_root(cgrp);
> + if (root_cgrp->release_agent_path)
> + agentbuf = kstrdup(root_cgrp->release_agent_path,
> + GFP_KERNEL);
> if (!agentbuf)
> goto continue_free;
>
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index ad3a698..a64b4a7 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -479,7 +479,7 @@ static void ve_drop_context(struct ve_struct *ve)
>
> static const struct timespec zero_time = { };
>
> -extern void cgroup_mark_ve_root(struct ve_struct *ve);
> +extern int cgroup_mark_ve_root(struct ve_struct *ve);
>
> /* under ve->op_sem write-lock */
> static int ve_start_container(struct ve_struct *ve)
> @@ -528,7 +528,9 @@ static int ve_start_container(struct ve_struct *ve)
> if (err < 0)
> goto err_iterate;
>
> - cgroup_mark_ve_root(ve);
> + err = cgroup_mark_ve_root(ve);
> + if (err)
> + goto err_mark_ve;
>
> ve->is_running = 1;
>
> @@ -538,6 +540,8 @@ static int ve_start_container(struct ve_struct *ve)
>
> return 0;
>
> +err_mark_ve:
> + ve_hook_iterate_fini(VE_SS_CHAIN, ve);
> err_iterate:
> ve_stop_umh(ve);
> err_umh:
>
--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.