[Devel] [PATCH RHEL7 v21 12/14] ve/cgroup: added release_agent to each container root cgroup.
Kirill Tkhai
ktkhai at virtuozzo.com
Fri Jul 31 11:24:22 MSK 2020
On 28.07.2020 20:53, Valeriy Vdovin wrote:
> Each container will now have access to it's own cgroup release_agent
> file.
> Creation:
> Normally all cgroup files are created during a call to cgroup_create
> by cgroup_populate_dir function. It creates or not creates all cgroup
> files once and they immediately become visible to userspace as
> filesystem objects.
> Due to specifics of container creation process, it is not possible to
> use the same code for 'release_agent' file creation. For VE to start
> operating, first a list of ordinary cgroups is being created for
> each subsystem, then the set of newly created cgroups are converted to
> "virtual roots", so at the time when cgroup_create is executed, there
> is no knowledge of wheather or not "release_agent" file should be
> created. This information only comes at "convertion" step which is
> 'cgroup_mark_ve_roots' function. As the file is created dynamically
> in a live cgroup, a rather delicate locking sequence is present in
> the new code:
> - each new "virtual root" cgroup will have to add "release_agent" file,
> thus each cgroup's directory would need to be locked during
> the insertion time by cgroup->dentry->d_inode->i_mutex.
> - d_inode->i_mutex has an ordering dependency with cgroup_mutex
> (see cgroup_mount/cgroup_remount). They can not be locked in order
> {lock(cgroup_mutex), lock(inode->i_mutex)}.
> - to collect a list of cgroups, that need to become virtual we need
> cgroup_mutex lock to iterate active roots.
> - to overcome the above conflict we first need to collect a list of
> all virtual cgroups under cgroup_mutex lock, then release it and
> after that to insert "release_agent" to each root under
> inode->i_mutex lock.
> - to collect a list of cgroups on stack we utilize
> cgroup->cft_q_node, made specially for that purpose under it's own
> cgroup_cft_mutex.
>
> Destruction:
> Destruction is done in reverse from the above within
> cgroup_unmark_ve_root.
> After file destruction we must prevent further write operations to
> this file in case when someone has opened this file prior to VE
> and cgroup destruction. This is achieved by checking if cgroup
> in the argument to cgroup_file_write function has features of host
> or virtual root.
>
> https://jira.sw.ru/browse/PSBM-83887
>
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
> include/linux/cgroup.h | 2 +-
> include/linux/ve.h | 2 +-
> kernel/cgroup.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++----
> kernel/ve/ve.c | 6 +++-
> 4 files changed, 87 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index fc138c0..645c9fd 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -671,7 +671,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
> void cgroup_release_agent(struct work_struct *work);
>
> #ifdef CONFIG_VE
> -void cgroup_mark_ve_roots(struct ve_struct *ve);
> +int cgroup_mark_ve_roots(struct ve_struct *ve);
> void cgroup_unmark_ve_roots(struct ve_struct *ve);
> struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
> #endif
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index b6662637..5bf275f 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -215,7 +215,7 @@ void do_update_load_avg_ve(void);
> void ve_add_to_release_list(struct cgroup *cgrp);
> void ve_rm_from_release_list(struct cgroup *cgrp);
>
> -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
> +int ve_set_release_agent_path(struct cgroup *cgroot,
> const char *release_agent);
>
> const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 1d9c889..bb77804 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -2360,13 +2360,28 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
> if (!cgroup_lock_live_group(cgrp))
> return -ENODEV;
>
> + /*
> + * Call to cgroup_get_local_root is a way to make sure
> + * that cgrp in the argument is valid "virtual root"
> + * cgroup. If it's not - this means that cgroup_unmark_ve_roots
> + * has already cleared CGRP_VE_ROOT flag from this cgroup while
> + * the file was still opened by other process and
> + * that we shouldn't allow further writings via that file.
> + */
> root_cgrp = cgroup_get_local_root(cgrp);
> BUG_ON(!root_cgrp);
> +
> + if (root_cgrp != cgrp) {
> + ret = -EPERM;
> + goto out;
> + }
> +
> if (root_cgrp->ve_owner)
> ret = ve_set_release_agent_path(root_cgrp, buffer);
> else
> return -ENODEV;
>
> +out:
> mutex_unlock(&cgroup_mutex);
> return ret;
> }
> @@ -4204,7 +4219,7 @@ static struct cftype files[] = {
> },
> {
> .name = "release_agent",
> - .flags = CFTYPE_ONLY_ON_ROOT,
> + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
> .read_seq_string = cgroup_release_agent_show,
> .write_string = cgroup_release_agent_write,
> .max_write_len = PATH_MAX,
> @@ -4422,39 +4437,98 @@ static int subgroups_count(struct cgroup *cgroup)
> return cgrps_count;
> }
>
> +static struct cftype *get_cftype_by_name(const char *name)
> +{
> + struct cftype *cft;
> + for (cft = files; cft->name[0] != '\0'; cft++) {
> + if (!strcmp(cft->name, name))
> + return cft;
> + }
> + return NULL;
> +}
> +
> #ifdef CONFIG_VE
> -void cgroup_mark_ve_roots(struct ve_struct *ve)
> +int cgroup_mark_ve_roots(struct ve_struct *ve)
> {
> - struct cgroup *cgrp;
> + struct cgroup *cgrp, *tmp;
> struct cgroupfs_root *root;
> + int err = 0;
> + struct cftype *cft;
> + LIST_HEAD(pending);
>
> + cft = get_cftype_by_name("release_agent");
> + BUG_ON(!cft);
> +
> + mutex_lock(&cgroup_cft_mutex);
> mutex_lock(&cgroup_mutex);
> for_each_active_root(root) {
> - cgrp = task_cgroup_from_root(ve->init_task, root);
> + cgrp = css_cgroup_from_root(ve->root_css_set, root);
> rcu_assign_pointer(cgrp->ve_owner, ve);
> set_bit(CGRP_VE_ROOT, &cgrp->flags);
>
> + dget(cgrp->dentry);
> + list_add_tail(&cgrp->cft_q_node, &pending);
> if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
> link_ve_root_cpu_cgroup(cgrp);
> }
> mutex_unlock(&cgroup_mutex);
> synchronize_rcu();
> + list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> + struct inode *inode = cgrp->dentry->d_inode;
> +
> + if (err) {
> + list_del_init(&cgrp->cft_q_node);
> + dput(cgrp->dentry);
> + continue;
> + }
How does this work?
If cgroup_add_file() sets error on previous iteration, then here we do double dput()?
> +
> + mutex_lock(&inode->i_mutex);
> + mutex_lock(&cgroup_mutex);
> + if (!cgroup_is_removed(cgrp))
> + err = cgroup_add_file(cgrp, NULL, cft);
> + mutex_unlock(&cgroup_mutex);
> + mutex_unlock(&inode->i_mutex);
> +
> + list_del_init(&cgrp->cft_q_node);
> + dput(cgrp->dentry);
> + }
> + mutex_unlock(&cgroup_cft_mutex);
> +
> + if (err)
> + cgroup_unmark_ve_roots(ve);
> +
> + return err;
> }
>
> void cgroup_unmark_ve_roots(struct ve_struct *ve)
> {
> - struct cgroup *cgrp;
> + struct cgroup *cgrp, *tmp;
> struct cgroupfs_root *root;
> + struct cftype *cft;
> + LIST_HEAD(pending);
> +
> + cft = get_cftype_by_name("release_agent");
>
> + mutex_lock(&cgroup_cft_mutex);
> mutex_lock(&cgroup_mutex);
> for_each_active_root(root) {
> cgrp = css_cgroup_from_root(ve->root_css_set, root);
> + dget(cgrp->dentry);
> + list_add_tail(&cgrp->cft_q_node, &pending);
> + }
> + mutex_unlock(&cgroup_mutex);
> + list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> + struct inode *inode = cgrp->dentry->d_inode;
> + mutex_lock(&inode->i_mutex);
> + mutex_lock(&cgroup_mutex);
> + cgroup_rm_file(cgrp, cft);
> BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
> lockdep_is_held(&cgroup_mutex)));
> rcu_assign_pointer(cgrp->ve_owner, NULL);
> clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> + mutex_unlock(&cgroup_mutex);
> + mutex_unlock(&inode->i_mutex);
> }
> - mutex_unlock(&cgroup_mutex);
> + mutex_unlock(&cgroup_cft_mutex);
> /* ve_owner == NULL will be visible */
> synchronize_rcu();
>
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index f03f665..8d78270 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -718,7 +718,9 @@ static int ve_start_container(struct ve_struct *ve)
> if (err < 0)
> goto err_iterate;
>
> - cgroup_mark_ve_roots(ve);
> + err = cgroup_mark_ve_roots(ve);
> + if (err)
> + goto err_mark_ve;
>
> ve->is_running = 1;
>
> @@ -728,6 +730,8 @@ static int ve_start_container(struct ve_struct *ve)
>
> return 0;
>
> +err_mark_ve:
> + ve_hook_iterate_fini(VE_SS_CHAIN, ve);
> err_iterate:
> ve_workqueue_stop(ve);
> err_workqueue:
>
More information about the Devel
mailing list