[Devel] [PATCH RHEL7 v21 12/14] ve/cgroup: added release_agent to each container root cgroup.

Kirill Tkhai ktkhai at virtuozzo.com
Fri Jul 31 11:24:22 MSK 2020


On 28.07.2020 20:53, Valeriy Vdovin wrote:
> Each container will now have access to it's own cgroup release_agent
> file.
> Creation:
>   Normally all cgroup files are created during a call to cgroup_create
>   by cgroup_populate_dir function. It creates or not creates all cgroup
>   files once and they immediately become visible to userspace as
>   filesystem objects.
>   Due to specifics of container creation process, it is not possible to
>   use the same code for 'release_agent' file creation. For VE to start
>   operating, first a list of ordinary cgroups is being created for
>   each subsystem, then the set of newly created cgroups are converted to
>   "virtual roots", so at the time when cgroup_create is executed, there
>   is no knowledge of wheather or not "release_agent" file should be
>   created. This information only comes at "convertion" step which is
>   'cgroup_mark_ve_roots' function. As the file is created dynamically
>   in a live cgroup, a rather delicate locking sequence is present in
>   the new code:
>     - each new "virtual root" cgroup will have to add "release_agent" file,
>       thus each cgroup's directory would need to be locked during
>       the insertion time by cgroup->dentry->d_inode->i_mutex.
>     - d_inode->i_mutex has an ordering dependency with cgroup_mutex
>       (see cgroup_mount/cgroup_remount). They can not be locked in order
>       {lock(cgroup_mutex), lock(inode->i_mutex)}.
>     - to collect a list of cgroups, that need to become virtual we need
>       cgroup_mutex lock to iterate active roots.
>     - to overcome the above conflict we first need to collect a list of
>       all virtual cgroups under cgroup_mutex lock, then release it and
>       after that to insert "release_agent" to each root under
>       inode->i_mutex lock.
>     - to collect a list of cgroups on stack we utilize
>       cgroup->cft_q_node, made specially for that purpose under it's own
>       cgroup_cft_mutex.
> 
> Destruction:
>   Destruction is done in reverse from the above within
>   cgroup_unmark_ve_root.
>   After file destruction we must prevent further write operations to
>   this file in case when someone has opened this file prior to VE
>   and cgroup destruction. This is achieved by checking if cgroup
>   in the argument to cgroup_file_write function has features of host
>   or virtual root.
> 
> https://jira.sw.ru/browse/PSBM-83887
> 
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
>  include/linux/cgroup.h |  2 +-
>  include/linux/ve.h     |  2 +-
>  kernel/cgroup.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++----
>  kernel/ve/ve.c         |  6 +++-
>  4 files changed, 87 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index fc138c0..645c9fd 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -671,7 +671,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
>  void cgroup_release_agent(struct work_struct *work);
>  
>  #ifdef CONFIG_VE
> -void cgroup_mark_ve_roots(struct ve_struct *ve);
> +int cgroup_mark_ve_roots(struct ve_struct *ve);
>  void cgroup_unmark_ve_roots(struct ve_struct *ve);
>  struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
>  #endif
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index b6662637..5bf275f 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -215,7 +215,7 @@ void do_update_load_avg_ve(void);
>  void ve_add_to_release_list(struct cgroup *cgrp);
>  void ve_rm_from_release_list(struct cgroup *cgrp);
>  
> -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
> +int ve_set_release_agent_path(struct cgroup *cgroot,
>  	const char *release_agent);
>  
>  const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 1d9c889..bb77804 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -2360,13 +2360,28 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
>  	if (!cgroup_lock_live_group(cgrp))
>  		return -ENODEV;
>  
> +	/*
> +	 * Call to cgroup_get_local_root is a way to make sure
> +	 * that cgrp in the argument is valid "virtual root"
> +	 * cgroup. If it's not - this means that cgroup_unmark_ve_roots
> +	 * has already cleared CGRP_VE_ROOT flag from this cgroup while
> +	 * the file was still opened by other process and
> +	 * that we shouldn't allow further writings via that file.
> +	 */
>  	root_cgrp = cgroup_get_local_root(cgrp);
>  	BUG_ON(!root_cgrp);
> +
> +	if (root_cgrp != cgrp) {
> +		ret = -EPERM;
> +		goto out;
> +	}
> +
>  	if (root_cgrp->ve_owner)
>  		ret = ve_set_release_agent_path(root_cgrp, buffer);
>  	else
>  		return -ENODEV;
>  
> +out:
>  	mutex_unlock(&cgroup_mutex);
>  	return ret;
>  }
> @@ -4204,7 +4219,7 @@ static struct cftype files[] = {
>  	},
>  	{
>  		.name = "release_agent",
> -		.flags = CFTYPE_ONLY_ON_ROOT,
> +		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
>  		.read_seq_string = cgroup_release_agent_show,
>  		.write_string = cgroup_release_agent_write,
>  		.max_write_len = PATH_MAX,
> @@ -4422,39 +4437,98 @@ static int subgroups_count(struct cgroup *cgroup)
>  	return cgrps_count;
>  }
>  
> +static struct cftype *get_cftype_by_name(const char *name)
> +{
> +	struct cftype *cft;
> +	for (cft = files; cft->name[0] != '\0'; cft++) {
> +		if (!strcmp(cft->name, name))
> +			return cft;
> +	}
> +	return NULL;
> +}
> +
>  #ifdef CONFIG_VE
> -void cgroup_mark_ve_roots(struct ve_struct *ve)
> +int cgroup_mark_ve_roots(struct ve_struct *ve)
>  {
> -	struct cgroup *cgrp;
> +	struct cgroup *cgrp, *tmp;
>  	struct cgroupfs_root *root;
> +	int err = 0;
> +	struct cftype *cft;
> +	LIST_HEAD(pending);
>  
> +	cft = get_cftype_by_name("release_agent");
> +	BUG_ON(!cft);
> +
> +	mutex_lock(&cgroup_cft_mutex);
>  	mutex_lock(&cgroup_mutex);
>  	for_each_active_root(root) {
> -		cgrp = task_cgroup_from_root(ve->init_task, root);
> +		cgrp = css_cgroup_from_root(ve->root_css_set, root);
>  		rcu_assign_pointer(cgrp->ve_owner, ve);
>  		set_bit(CGRP_VE_ROOT, &cgrp->flags);
>  
> +		dget(cgrp->dentry);
> +		list_add_tail(&cgrp->cft_q_node, &pending);
>  		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
>  			link_ve_root_cpu_cgroup(cgrp);
>  	}
>  	mutex_unlock(&cgroup_mutex);
>  	synchronize_rcu();
> +	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> +		struct inode *inode = cgrp->dentry->d_inode;
> +
> +		if (err) {
> +			list_del_init(&cgrp->cft_q_node);
> +			dput(cgrp->dentry);
> +			continue;
> +		}

How does this work?

If cgroup_add_file() sets error on previous iteration, then here we do double dput()?

> +
> +		mutex_lock(&inode->i_mutex);
> +		mutex_lock(&cgroup_mutex);
> +		if (!cgroup_is_removed(cgrp))
> +			err = cgroup_add_file(cgrp, NULL, cft);
> +		mutex_unlock(&cgroup_mutex);
> +		mutex_unlock(&inode->i_mutex);
> +
> +		list_del_init(&cgrp->cft_q_node);
> +		dput(cgrp->dentry);
> +	}
> +	mutex_unlock(&cgroup_cft_mutex);
> +
> +	if (err)
> +		cgroup_unmark_ve_roots(ve);
> +
> +	return err;
>  }
>  
>  void cgroup_unmark_ve_roots(struct ve_struct *ve)
>  {
> -	struct cgroup *cgrp;
> +	struct cgroup *cgrp, *tmp;
>  	struct cgroupfs_root *root;
> +	struct cftype *cft;
> +	LIST_HEAD(pending);
> +
> +	cft = get_cftype_by_name("release_agent");
>  
> +	mutex_lock(&cgroup_cft_mutex);
>  	mutex_lock(&cgroup_mutex);
>  	for_each_active_root(root) {
>  		cgrp = css_cgroup_from_root(ve->root_css_set, root);
> +		dget(cgrp->dentry);
> +		list_add_tail(&cgrp->cft_q_node, &pending);
> +	}
> +	mutex_unlock(&cgroup_mutex);
> +	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> +		struct inode *inode = cgrp->dentry->d_inode;
> +		mutex_lock(&inode->i_mutex);
> +		mutex_lock(&cgroup_mutex);
> +		cgroup_rm_file(cgrp, cft);
>  		BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
>  				lockdep_is_held(&cgroup_mutex)));
>  		rcu_assign_pointer(cgrp->ve_owner, NULL);
>  		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> +		mutex_unlock(&cgroup_mutex);
> +		mutex_unlock(&inode->i_mutex);
>  	}
> -	mutex_unlock(&cgroup_mutex);
> +	mutex_unlock(&cgroup_cft_mutex);
>  	/* ve_owner == NULL will be visible */
>  	synchronize_rcu();
>  
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index f03f665..8d78270 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -718,7 +718,9 @@ static int ve_start_container(struct ve_struct *ve)
>  	if (err < 0)
>  		goto err_iterate;
>  
> -	cgroup_mark_ve_roots(ve);
> +	err = cgroup_mark_ve_roots(ve);
> +	if (err)
> +		goto err_mark_ve;
>  
>  	ve->is_running = 1;
>  
> @@ -728,6 +730,8 @@ static int ve_start_container(struct ve_struct *ve)
>  
>  	return 0;
>  
> +err_mark_ve:
> +	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>  err_iterate:
>  	ve_workqueue_stop(ve);
>  err_workqueue:
> 



More information about the Devel mailing list