[Devel] [PATCH 4/7] ve/cgroup: Moved cgroup release notifications to per-ve workqueues.

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Thu Apr 2 13:00:16 MSK 2020



On 4/1/20 6:41 PM, Valeriy Vdovin wrote:
> Notifications about cgroups becoming empty should be executed from
> inside the VE that owns the cgroup. This patch splits the notification
> logic between ve code and cgroup code. The VE is now responsible for
> storing the list of cgroups that want to notify about themselves. The
> VE also manages the execution of release_agent work under its domain,
> so the usermode process is spawned in this VE's namespaces.
> The VE is also responsible for refusing to schedule release_agent
> works while it is stopping.
> cgroup code is still responsible for managing the release_agent path
> parameter (setting/getting it and passing it between mounts), but it is
> now also responsible for storing the owning ve as a field in the cgroup
> struct.
> 
> https://jira.sw.ru/browse/PSBM-83887
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
>   include/linux/cgroup.h |  26 +++++++++
>   include/linux/ve.h     |  17 ++++++
>   kernel/cgroup.c        | 156 +++++++++++++++++++++++++++++++++----------------
>   kernel/ve/ve.c         |  93 ++++++++++++++++++++++++++++-
>   4 files changed, 238 insertions(+), 54 deletions(-)
> 
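
For context, the release-notification path after this patch looks roughly
like this (a simplified sketch assembled from the hunks below, not the
exact code):

    /* cgroup becomes releasable */
    check_for_release(cgrp)
        root_cgrp = cgroup_get_local_root(cgrp);   /* host or VE root */
        if (root_cgrp->ve_owner)
            ve_add_to_release_list(root_cgrp->ve_owner, cgrp);

    /* kernel/ve/ve.c */
    ve_add_to_release_list(ve, cgrp)
        /* link cgrp into ve->release_list under ve->release_list_lock */
        queue_work(ve->wq, &ve->release_agent_work);

    /* ve->release_agent_work, runs on ve->wq */
    cgroup_release_agent(work)
        /* drain ve->release_list and spawn the agent inside the VE */
        call_usermodehelper_fns_ve(ve, argv[0], argv, envp,
                                   UMH_WAIT_EXEC, NULL, NULL, NULL);
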
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index b0b8b9e..86d3e5d 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -315,6 +315,9 @@ struct cgroup {
>   	/* directory xattrs */
>   	struct simple_xattrs xattrs;
>   	u64 subgroups_limit;
> +
> +	/* ve_owner, responsible for running release agent. */
> +	struct ve_struct *ve_owner;
>   };
>   
>   #define MAX_CGROUP_ROOT_NAMELEN 64
> @@ -466,6 +469,27 @@ struct css_set {
>   };
>   
>   /*
> + * refcounted get/put for css_set objects
> + */
> +
> +void __put_css_set(struct css_set *cg, int taskexit);
> +
> +static inline void get_css_set(struct css_set *cg)
> +{
> +	atomic_inc(&cg->refcount);
> +}
> +
> +static inline void put_css_set(struct css_set *cg)
> +{
> +	__put_css_set(cg, 0);
> +}
> +
> +static inline void put_css_set_taskexit(struct css_set *cg)
> +{
> +	__put_css_set(cg, 1);
> +}
> +
> +/*
>    * cgroup_map_cb is an abstract callback API for reporting map-valued
>    * control files
>    */
> @@ -636,9 +660,11 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
>   int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen);
>   
>   int cgroup_task_count(const struct cgroup *cgrp);
> +void cgroup_release_agent(struct work_struct *work);
>   
>   #ifdef CONFIG_VE
>   void cgroup_mark_ve_root(struct ve_struct *ve);
> +void cgroup_unbind_roots_from_ve(struct ve_struct *ve);
>   #endif
>   
>   /*
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 92ec8f9..fcdb25a 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -126,7 +126,22 @@ struct ve_struct {
>   #endif
>   	struct kmapset_key	sysfs_perms_key;
>   
> +	/*
> +	 * Cgroups that want to notify about becoming
> +	 * empty are linked into this release_list.
> +	 */
> +	struct list_head	release_list;
> +	struct raw_spinlock	release_list_lock;
> +
>   	struct workqueue_struct	*wq;
> +	struct work_struct	release_agent_work;
> +
> +	/*
> +	 * All tasks that belong to this ve live in
> +	 * cgroups that are children of the cgroups
> +	 * forming this css_set.
> +	 */
> +	struct css_set __rcu	*root_css_set;
>   };
>   
>   struct ve_devmnt {
> @@ -190,6 +205,8 @@ call_usermodehelper_ve(struct ve_struct *ve, char *path, char **argv,
>   }
>   void do_update_load_avg_ve(void);
>   
> +void ve_add_to_release_list(struct ve_struct *ve, struct cgroup *cgrp);
> +void ve_rm_from_release_list(struct ve_struct *ve, struct cgroup *cgrp);
>   extern struct ve_struct *get_ve(struct ve_struct *ve);
>   extern void put_ve(struct ve_struct *ve);
>   
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 3df28a2..680ec8e 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -271,10 +271,6 @@ static bool cgroup_lock_live_group(struct cgroup *cgrp)
>   
>   /* the list of cgroups eligible for automatic release. Protected by
>    * release_list_lock */
> -static LIST_HEAD(release_list);
> -static DEFINE_RAW_SPINLOCK(release_list_lock);
> -static void cgroup_release_agent(struct work_struct *work);
> -static DECLARE_WORK(release_agent_work, cgroup_release_agent);
>   static void check_for_release(struct cgroup *cgrp);
>   
>   /* Link structure for associating css_set objects with cgroups */
> @@ -335,7 +331,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
>    * compiled into their kernel but not actually in use */
>   static int use_task_css_set_links __read_mostly;
>   
> -static void __put_css_set(struct css_set *cg, int taskexit)
> +void __put_css_set(struct css_set *cg, int taskexit)
>   {
>   	struct cg_cgroup_link *link;
>   	struct cg_cgroup_link *saved_link;
> @@ -384,24 +380,6 @@ static void __put_css_set(struct css_set *cg, int taskexit)
>   }
>   
>   /*
> - * refcounted get/put for css_set objects
> - */
> -static inline void get_css_set(struct css_set *cg)
> -{
> -	atomic_inc(&cg->refcount);
> -}
> -
> -static inline void put_css_set(struct css_set *cg)
> -{
> -	__put_css_set(cg, 0);
> -}
> -
> -static inline void put_css_set_taskexit(struct css_set *cg)
> -{
> -	__put_css_set(cg, 1);
> -}
> -
> -/*
>    * compare_css_sets - helper function for find_existing_css_set().
>    * @cg: candidate css_set being tested
>    * @old_cg: existing css_set for a task
> @@ -654,6 +632,35 @@ static struct css_set *find_css_set(
>   	return res;
>   }
>   
> +static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
> +					    struct cgroupfs_root *root)
> +{
> +	struct cgroup *res = NULL;
> +
> +	BUG_ON(!mutex_is_locked(&cgroup_mutex));
> +	read_lock(&css_set_lock);
> +	/*
> +	 * No need to lock the task - since we hold cgroup_mutex the
> +	 * task can't change groups, so the only thing that can happen
> +	 * is that it exits and its css is set back to init_css_set.
> +	 */
> +	if (css_set == &init_css_set) {
> +		res = &root->top_cgroup;
> +	} else {
> +		struct cg_cgroup_link *link;
> +		list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
> +			struct cgroup *c = link->cgrp;
> +			if (c->root == root) {
> +				res = c;
> +				break;
> +			}
> +		}
> +	}
> +	read_unlock(&css_set_lock);
> +	BUG_ON(!res);
> +	return res;
> +}
> +
>   /*
>    * Return the cgroup for "task" from the given hierarchy. Must be
>    * called with cgroup_mutex held.
> @@ -796,6 +803,31 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
>   	return name;
>   }
>   
> +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
> +{
> +	/*
> +	 * Find nearest root cgroup, which might be host cgroup root
> +	 * or ve cgroup root.
> +	 *
> +	 *    <host_root_cgroup> -> local_root
> +	 *     \                    ^
> +	 *      <cgroup>            |
> +	 *       \                  |
> +	 *        <cgroup>   --->   from here
> +	 *        \
> +	 *         <ve_root_cgroup> -> local_root
> +	 *         \                   ^
> +	 *          <cgroup>           |
> +	 *          \                  |
> +	 *           <cgroup>  --->    from here
> +	 */
> +	lockdep_assert_held(&cgroup_mutex);
> +	while (cgrp->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags))
> +		cgrp = cgrp->parent;
> +
> +	return cgrp;
> +}
> +
>   static void cgroup_free_fn(struct work_struct *work)
>   {
>   	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
> @@ -1638,6 +1670,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
>   		const struct cred *cred;
>   		int i;
>   		struct css_set *cg;
> +		root_cgrp->ve_owner = get_exec_env();
>   
>   		BUG_ON(sb->s_root != NULL);
>   
> @@ -4320,6 +4353,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
>   	mutex_lock(&cgroup_mutex);
>   	for_each_active_root(root) {
>   		cgrp = task_cgroup_from_root(ve->init_task, root);
> +		cgrp->ve_owner = ve;
>   		set_bit(CGRP_VE_ROOT, &cgrp->flags);
>   
>   		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
> @@ -4328,6 +4362,20 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
>   	mutex_unlock(&cgroup_mutex);
>   }
>   
> +void cgroup_unbind_roots_from_ve(struct ve_struct *ve)
> +{
> +	struct cgroup *cgrp;
> +	struct cgroupfs_root *root;
> +
> +	mutex_lock(&cgroup_mutex);
> +	for_each_active_root(root) {
> +		cgrp = css_cgroup_from_root(ve->root_css_set, root);
> +		BUG_ON(!cgrp->ve_owner);
> +		cgrp->ve_owner = NULL;
> +	}
> +	mutex_unlock(&cgroup_mutex);
> +}
> +
>   struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
>   {
>   	struct cgroup *ve_root = NULL;
> @@ -4564,6 +4612,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
>   	struct cgroup_event *event, *tmp;
>   	struct cgroup_subsys *ss;
>   	struct cgroup *child;
> +	struct cgroup *root_cgrp;
>   	bool empty;
>   
>   	lockdep_assert_held(&d->d_inode->i_mutex);
> @@ -4620,11 +4669,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
>   
>   	set_bit(CGRP_REMOVED, &cgrp->flags);
>   
> -	raw_spin_lock(&release_list_lock);
> -	if (!list_empty(&cgrp->release_list))
> -		list_del_init(&cgrp->release_list);
> -	raw_spin_unlock(&release_list_lock);
> -
> +	root_cgrp = cgroup_get_local_root(cgrp);
> +	if (root_cgrp->ve_owner)
> +		ve_rm_from_release_list(root_cgrp->ve_owner, cgrp);
>   	/*
>   	 * Remove @cgrp directory.  The removal puts the base ref but we
>   	 * aren't quite done with @cgrp yet, so hold onto it.
> @@ -5399,6 +5446,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
>   
>   static void check_for_release(struct cgroup *cgrp)
>   {
> +	struct cgroup *root_cgrp;
>   	/* All of these checks rely on RCU to keep the cgroup
>   	 * structure alive */
>   	if (cgroup_is_releasable(cgrp) &&
> @@ -5408,17 +5456,10 @@ static void check_for_release(struct cgroup *cgrp)
>   		 * already queued for a userspace notification, queue
>   		 * it now
>   		 */
> -		int need_schedule_work = 0;
> -
> -		raw_spin_lock(&release_list_lock);
> -		if (!cgroup_is_removed(cgrp) &&
> -		    list_empty(&cgrp->release_list)) {
> -			list_add(&cgrp->release_list, &release_list);
> -			need_schedule_work = 1;
> -		}
> -		raw_spin_unlock(&release_list_lock);
> -		if (need_schedule_work)
> -			schedule_work(&release_agent_work);
> +		root_cgrp = cgroup_get_local_root(cgrp);
> +		BUG_ON(!root_cgrp);
> +		if (root_cgrp->ve_owner)
> +			ve_add_to_release_list(root_cgrp->ve_owner, cgrp);
>   	}
>   }
>   
> @@ -5445,24 +5486,35 @@ static void check_for_release(struct cgroup *cgrp)
>    * this routine has no use for the exit status of the release agent
>    * task, so no sense holding our caller up for that.
>    */
> -static void cgroup_release_agent(struct work_struct *work)
> +void cgroup_release_agent(struct work_struct *work)
>   {
> -	BUG_ON(work != &release_agent_work);
> +	struct ve_struct *ve;
> +	ve = container_of(work, struct ve_struct, release_agent_work);

> +	down_read(&ve->op_sem);
> +	if (!ve->is_running) {
> +		up_read(&ve->op_sem);
> +		return;
> +	}
> +	up_read(&ve->op_sem);

Can you please explain why we have a lock around a single is_running
check? It can switch to zero immediately after up_read.
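
I.e. the check only samples a snapshot of is_running; an interleaving
like the following is still possible right after the unlock (a sketch,
not actual code):

    /* cgroup_release_agent() on ve->wq */
    down_read(&ve->op_sem);
    /* sees ve->is_running == 1 */
    up_read(&ve->op_sem);

    /* meanwhile, ve_stop_ns() */
    down_write(&ve->op_sem);
    ve->is_running = 0;
    up_write(&ve->op_sem);

    /* back in cgroup_release_agent(): continues as if
     * the VE were still running */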

>   	mutex_lock(&cgroup_mutex);
> -	raw_spin_lock(&release_list_lock);
> -	while (!list_empty(&release_list)) {
> +	raw_spin_lock(&ve->release_list_lock);
> +	while (!list_empty(&ve->release_list)) {
>   		char *argv[3], *envp[3];
>   		int i, err;
> +		const char *release_agent;
>   		char *pathbuf = NULL, *agentbuf = NULL;
> -		struct cgroup *cgrp = list_entry(release_list.next,
> -						    struct cgroup,
> -						    release_list);
> +		struct cgroup *cgrp, *root_cgrp;
> +
> +		cgrp = list_entry(ve->release_list.next,
> +				  struct cgroup,
> +				  release_list);
> +
>   		list_del_init(&cgrp->release_list);
> -		raw_spin_unlock(&release_list_lock);
> +		raw_spin_unlock(&ve->release_list_lock);
>   		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
>   		if (!pathbuf)
>   			goto continue_free;
> -		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
> +		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, 1) < 0)
>   			goto continue_free;
>   		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
>   		if (!agentbuf)
> @@ -5483,7 +5535,9 @@ static void cgroup_release_agent(struct work_struct *work)
>   		 * since the exec could involve hitting disk and hence
>   		 * be a slow process */
>   		mutex_unlock(&cgroup_mutex);
> -		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
> +
> +		err = call_usermodehelper_fns_ve(ve, argv[0], argv,
> +			envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
>   		if (err < 0)
>   			pr_warn_ratelimited("cgroup release_agent "
>   					    "%s %s failed: %d\n",
> @@ -5493,9 +5547,9 @@ static void cgroup_release_agent(struct work_struct *work)
>    continue_free:
>   		kfree(pathbuf);
>   		kfree(agentbuf);
> -		raw_spin_lock(&release_list_lock);
> +		raw_spin_lock(&ve->release_list_lock);
>   	}
> -	raw_spin_unlock(&release_list_lock);
> +	raw_spin_unlock(&ve->release_list_lock);
>   	mutex_unlock(&cgroup_mutex);
>   }
>   
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 587f445..7eff0e7 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -87,6 +87,10 @@ struct ve_struct ve0 = {
>   	.netif_max_nr		= INT_MAX,
>   	.arp_neigh_nr		= ATOMIC_INIT(0),
>   	.nd_neigh_nr		= ATOMIC_INIT(0),
> +	.release_list_lock	= __RAW_SPIN_LOCK_UNLOCKED(ve0.release_list_lock),
> +	.release_list		= LIST_HEAD_INIT(ve0.release_list),
> +	.release_agent_work	= __WORK_INITIALIZER(ve0.release_agent_work,
> +					cgroup_release_agent),
>   };
>   EXPORT_SYMBOL(ve0);
>   
> @@ -453,6 +457,9 @@ static void ve_grab_context(struct ve_struct *ve)
>   
>   	get_task_struct(tsk);
>   	ve->init_task = tsk;
> +	ve->root_css_set = tsk->cgroups;
> +	get_css_set(ve->root_css_set);
> +
>   	ve->init_cred = (struct cred *)get_current_cred();
>   	rcu_assign_pointer(ve->ve_ns, get_nsproxy(tsk->nsproxy));
>   	ve->ve_netns =  get_net(ve->ve_ns->net_ns);
> @@ -465,6 +472,9 @@ static void ve_drop_context(struct ve_struct *ve)
>   	put_net(ve->ve_netns);
>   	ve->ve_netns = NULL;
>   
> +	put_css_set_taskexit(ve->root_css_set);
> +	ve->root_css_set = NULL;
> +
>   	/* Allows to dereference init_cred and init_task if ve_ns is set */
>   	rcu_assign_pointer(ve->ve_ns, NULL);
>   	synchronize_rcu();
> @@ -477,7 +487,6 @@ static void ve_drop_context(struct ve_struct *ve)
>   
>   	put_task_struct(ve->init_task);
>   	ve->init_task = NULL;
> -
>   }
>   
>   static const struct timespec zero_time = { };
> @@ -504,6 +513,53 @@ static void ve_workqueue_stop(struct ve_struct *ve)
>   	destroy_workqueue(ve->wq);
>   }
>   
> +void ve_add_to_release_list(struct ve_struct *ve, struct cgroup *cgrp)
> +{
> +	int need_schedule_work = 0;
> +
> +	down_write(&ve->op_sem);
> +
> +	if (!ve->is_running)
> +		goto out_unlock;
> +
> +	/*
> +	 * The chain of guarantees here is as follows:
> +	 * - release_list_lock is valid because ve != NULL.
> +	 * - is_running == 1 stays valid because ve->op_sem is held.
> +	 * - ve->op_sem is valid because ve != NULL.
> +	 * - ve != NULL must be guaranteed by the caller.
> +	 */
> +	raw_spin_lock(&ve->release_list_lock);
> +	if (!cgroup_is_removed(cgrp) &&
> +	    list_empty(&cgrp->release_list)) {
> +		list_add(&cgrp->release_list, &ve->release_list);
> +		need_schedule_work = 1;
> +	}
> +	raw_spin_unlock(&ve->release_list_lock);
> +	/*
> +	 * Because we hold ve->op_sem, ve->is_running can be cleared
> +	 * only after this work is already queued to ve->wq.
> +	 * ve_stop_ns is thus guaranteed to flush this work
> +	 * before proceeding.
> +	 */
> +	if (need_schedule_work)
> +		queue_work(ve->wq, &ve->release_agent_work);
> +
> +out_unlock:
> +	up_write(&ve->op_sem);
> +}
> +
> +void ve_rm_from_release_list(struct ve_struct *ve, struct cgroup *cgrp)
> +{
> +	/*
> +	 * There is no is_running check here, because the caller
> +	 * is more flexible in taking ve->op_sem from above.
> +	 */
> +	raw_spin_lock(&ve->release_list_lock);
> +	if (!list_empty(&cgrp->release_list))
> +		list_del_init(&cgrp->release_list);
> +	raw_spin_unlock(&ve->release_list_lock);
> +}
>   
>   /* under ve->op_sem write-lock */
>   static int ve_start_container(struct ve_struct *ve)
> @@ -579,6 +635,8 @@ err_list:
>   	return err;
>   }
>   
> +extern void cgroup_unbind_roots_from_ve(struct ve_struct *ve);
> +
>   void ve_stop_ns(struct pid_namespace *pid_ns)
>   {
>   	struct ve_struct *ve = current->task_ve;
> @@ -590,19 +648,44 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
>   	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
>   		return;
>   
> -	wait_khelpers();
> -
>   	down_write(&ve->op_sem);
> +
> +	/*
> +	 * Zero-out cgroup->ve_owner for all ve roots.
> +	 * After this is done, cgroups can no longer get
> +	 * into this ve's release_list via
> +	 * check_for_release.
> +	 */
> +	cgroup_unbind_roots_from_ve(ve);
> +
>   	/*
>   	 * Here the VE changes its state into "not running".
>   	 * op_sem works as barrier for vzctl ioctls.
>   	 * ve_mutex works as barrier for ve_can_attach().
> +	 * is_running = 0 also blocks further insertions to
> +	 * release_list.
>   	 */
>   	ve->is_running = 0;
>   
> +	up_write(&ve->op_sem);
> +
> +	/*
> +	 * Stopping the workqueue flushes all works queued to it and
> +	 * disables all further work insertions into it. The ordering
> +	 * relative to wait_khelpers does not matter.
> +	 */
>   	ve_workqueue_stop(ve);
>   
>   	/*
> +	 * Wait until all calls to call_usermodehelper_by have
> +	 * finished, to ensure that any usermode tasks launched
> +	 * before ve->is_running was set to 0 have finished by this line.
> +	 */
> +	wait_khelpers();

The call to wait_khelpers is meant to wait for all call_usermodehelper_by
calls started before ve->init_task->flags was set to PF_EXITING to finish.

As you mentioned in slack, we actually should wait not for all
call_usermodehelper_by calls but only for the ones related to the current
ve. But that does not depend on is_running as you write in the comment,
AFAICS.
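
If we go per-VE, one possible shape would be counting in-flight helpers
per VE instead of flushing all of them globally, e.g. (purely
hypothetical sketch; nr_khelpers and khelpers_wq are made-up names, not
existing fields):

    /* hypothetical fields in struct ve_struct */
    atomic_t          nr_khelpers;
    wait_queue_head_t khelpers_wq;

    /* around each helper invocation done on behalf of the VE */
    atomic_inc(&ve->nr_khelpers);
    err = call_usermodehelper_fns_ve(ve, argv[0], argv, envp,
                                     UMH_WAIT_EXEC, NULL, NULL, NULL);
    if (atomic_dec_and_test(&ve->nr_khelpers))
        wake_up(&ve->khelpers_wq);

    /* in ve_stop_ns(), instead of the global wait_khelpers() */
    wait_event(ve->khelpers_wq, atomic_read(&ve->nr_khelpers) == 0);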

> +
> +	down_write(&ve->op_sem);
> +
> +	/*
>   	 * Neither it can be in pseudosuper state
>   	 * anymore, setup it again if needed.
>   	 */
> @@ -698,6 +781,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
>   	if (!ve->ve_name)
>   		goto err_name;
>   
> +	INIT_WORK(&ve->release_agent_work, cgroup_release_agent);
> +	raw_spin_lock_init(&ve->release_list_lock);
> +
>   	ve->_randomize_va_space = ve0._randomize_va_space;
>   
>   	ve->features = VE_FEATURES_DEF;
> @@ -732,6 +818,7 @@ do_init:
>   	INIT_LIST_HEAD(&ve->devices);
>   	INIT_LIST_HEAD(&ve->ve_list);
>   	INIT_LIST_HEAD(&ve->devmnt_list);
> +	INIT_LIST_HEAD(&ve->release_list);
>   	mutex_init(&ve->devmnt_mutex);
>   
>   #ifdef CONFIG_AIO
> 

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.

