[Devel] [PATCH v12 08/12] ve/cgroup: moved release_agent from system_wq to per-ve workqueues

Kirill Tkhai ktkhai at virtuozzo.com
Mon Apr 20 12:56:24 MSK 2020


On 20.04.2020 06:42, Valeriy Vdovin wrote:
> Each VE should execute release agent notifications within it's own
> workqueue. This way we achieve a more fine-grained control over
> release_agent work flushing at VE destruction.
> 
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
>  include/linux/cgroup.h |  1 +
>  include/linux/ve.h     | 10 ++++++++++
>  kernel/cgroup.c        | 53 ++++++++++++++++++++++----------------------------
>  kernel/ve/ve.c         | 44 +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 78 insertions(+), 30 deletions(-)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 7ed718d..d0ce3cc 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -636,6 +636,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
>  int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen);
>  
>  int cgroup_task_count(const struct cgroup *cgrp);
> +void cgroup_release_agent(struct work_struct *work);
>  
>  #ifdef CONFIG_VE
>  void cgroup_mark_ve_roots(struct ve_struct *ve);
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 02e00e9..787a96f 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -126,7 +126,15 @@ struct ve_struct {
>  #endif
>  	struct kmapset_key	sysfs_perms_key;
>  
> +	/*
> +	 * cgroups, that want to notify about becoming
> +	 * empty, are linked to this release_list.
> +	 */
> +	struct list_head	release_list;
> +	struct raw_spinlock	release_list_lock;
> +
>  	struct workqueue_struct	*wq;
> +	struct work_struct	release_agent_work;
>  
>  	/*
>  	 * All tasks, that belong to this ve, live
> @@ -197,6 +205,8 @@ call_usermodehelper_ve(struct ve_struct *ve, char *path, char **argv,
>  }
>  void do_update_load_avg_ve(void);
>  
> +void ve_add_to_release_list(struct cgroup *cgrp);
> +void ve_rm_from_release_list(struct cgroup *cgrp);
>  extern struct ve_struct *get_ve(struct ve_struct *ve);
>  extern void put_ve(struct ve_struct *ve);
>  
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index b19d749..80a4a01 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -269,10 +269,6 @@ static bool cgroup_lock_live_group(struct cgroup *cgrp)
>  
>  /* the list of cgroups eligible for automatic release. Protected by
>   * release_list_lock */
> -static LIST_HEAD(release_list);
> -static DEFINE_RAW_SPINLOCK(release_list_lock);
> -static void cgroup_release_agent(struct work_struct *work);
> -static DECLARE_WORK(release_agent_work, cgroup_release_agent);
>  static void check_for_release(struct cgroup *cgrp);
>  
>  /* Link structure for associating css_set objects with cgroups */
> @@ -4316,6 +4312,15 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve)
>  	mutex_unlock(&cgroup_mutex);
>  	/* ve_owner == NULL will be visible */
>  	synchronize_rcu();
> +
> +	/*
> +	 * Anyone already waiting in this wq to execute
> +	 * cgroup_release_agent doesn't know that ve_owner is NULL,
> +	 * but we can wait for all of them at flush_workqueue.
> +	 * After it is complete no other cgroup can seep through
> +	 * to this ve's workqueue, so it's safe to shutdown ve.
> +	 */
> +	flush_workqueue(ve->wq);
>  }
>  
>  struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
> @@ -4610,11 +4615,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
>  
>  	set_bit(CGRP_REMOVED, &cgrp->flags);
>  
> -	raw_spin_lock(&release_list_lock);
> -	if (!list_empty(&cgrp->release_list))
> -		list_del_init(&cgrp->release_list);
> -	raw_spin_unlock(&release_list_lock);
> -
> +	ve_rm_from_release_list(cgrp);
>  	/*
>  	 * Remove @cgrp directory.  The removal puts the base ref but we
>  	 * aren't quite done with @cgrp yet, so hold onto it.
> @@ -5398,17 +5399,7 @@ static void check_for_release(struct cgroup *cgrp)
>  		 * already queued for a userspace notification, queue
>  		 * it now
>  		 */
> -		int need_schedule_work = 0;
> -
> -		raw_spin_lock(&release_list_lock);
> -		if (!cgroup_is_removed(cgrp) &&
> -		    list_empty(&cgrp->release_list)) {
> -			list_add(&cgrp->release_list, &release_list);
> -			need_schedule_work = 1;
> -		}
> -		raw_spin_unlock(&release_list_lock);
> -		if (need_schedule_work)
> -			schedule_work(&release_agent_work);
> +		ve_add_to_release_list(cgrp);
>  	}
>  }
>  
> @@ -5435,20 +5426,22 @@ static void check_for_release(struct cgroup *cgrp)
>   * this routine has no use for the exit status of the release agent
>   * task, so no sense holding our caller up for that.
>   */
> -static void cgroup_release_agent(struct work_struct *work)
> +void cgroup_release_agent(struct work_struct *work)
>  {
> -	BUG_ON(work != &release_agent_work);
>  	mutex_lock(&cgroup_mutex);
> -	raw_spin_lock(&release_list_lock);
> -	while (!list_empty(&release_list)) {
> +	raw_spin_lock(&ve->release_list_lock);
> +	while (!list_empty(&ve->release_list)) {
>  		char *argv[3], *envp[3];
>  		int i, err;
>  		char *pathbuf = NULL, *agentbuf = NULL;
> -		struct cgroup *cgrp = list_entry(release_list.next,
> -						    struct cgroup,
> -						    release_list);
> +		struct cgroup *cgrp;
> +
> +		cgrp = list_entry(ve->release_list.next,
> +				  struct cgroup,
> +				  release_list);
> +
>  		list_del_init(&cgrp->release_list);
> -		raw_spin_unlock(&release_list_lock);
> +		raw_spin_unlock(&ve->release_list_lock);
>  		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
>  		if (!pathbuf)
>  			goto continue_free;
> @@ -5483,9 +5476,9 @@ static void cgroup_release_agent(struct work_struct *work)
>   continue_free:
>  		kfree(pathbuf);
>  		kfree(agentbuf);
> -		raw_spin_lock(&release_list_lock);
> +		raw_spin_lock(&ve->release_list_lock);
>  	}
> -	raw_spin_unlock(&release_list_lock);
> +	raw_spin_unlock(&ve->release_list_lock);
>  	mutex_unlock(&cgroup_mutex);
>  }
>  
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index db3b600..659f19d 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -87,6 +87,11 @@ struct ve_struct ve0 = {
>  	.netif_max_nr		= INT_MAX,
>  	.arp_neigh_nr		= ATOMIC_INIT(0),
>  	.nd_neigh_nr		= ATOMIC_INIT(0),
> +	.release_list_lock	= __RAW_SPIN_LOCK_UNLOCKED(
> +					ve0.release_list_lock),
> +	.release_list		= LIST_HEAD_INIT(ve0.release_list),
> +	.release_agent_work	= __WORK_INITIALIZER(ve0.release_agent_work,
> +					cgroup_release_agent),
>  };
>  EXPORT_SYMBOL(ve0);
>  
> @@ -498,6 +503,41 @@ static void ve_workqueue_stop(struct ve_struct *ve)
>  	destroy_workqueue(ve->wq);
>  }
>  
> +void ve_add_to_release_list(struct cgroup *cgrp)
> +{
> +	struct ve_struct *ve;
> +	int need_schedule_work = 0;
> +
> +	rcu_read_lock();
> +	ve = cgroup_get_ve_owner(cgrp);
> +
> +	raw_spin_lock(&ve->release_list_lock);
> +	if (!cgroup_is_removed(cgrp) &&
> +	    list_empty(&cgrp->release_list)) {
> +		list_add(&cgrp->release_list, &ve->release_list);
> +		need_schedule_work = 1;
> +	}
> +	raw_spin_unlock(&ve->release_list_lock);
> +
> +	if (need_schedule_work)
> +		queue_work(ve->wq, &ve->release_agent_work);
> +
> +	rcu_read_unlock();
> +}
> +
> +void ve_rm_from_release_list(struct cgroup *cgrp)
> +{
> +	struct ve_struct *ve;
> +	rcu_read_lock();
> +	ve = cgroup_get_ve_owner(cgrp);
> +	rcu_read_unlock();
> +
> +	raw_spin_lock(&ve->release_list_lock);
> +	if (!list_empty(&cgrp->release_list))
> +		list_del_init(&cgrp->release_list);
> +	raw_spin_unlock(&ve->release_list_lock);


Shouldn't rcu_read_unlock() be made here instead of above?
Something should guarantee ve is alive till ve access its memory.

> +}
> +
>  /* under ve->op_sem write-lock */
>  static int ve_start_container(struct ve_struct *ve)
>  {
> @@ -693,6 +733,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
>  	if (!ve->ve_name)
>  		goto err_name;
>  
> +	INIT_WORK(&ve->release_agent_work, cgroup_release_agent);
> +	raw_spin_lock_init(&ve->release_list_lock);
> +
>  	ve->_randomize_va_space = ve0._randomize_va_space;
>  
>  	ve->features = VE_FEATURES_DEF;
> @@ -727,6 +770,7 @@ do_init:
>  	INIT_LIST_HEAD(&ve->devices);
>  	INIT_LIST_HEAD(&ve->ve_list);
>  	INIT_LIST_HEAD(&ve->devmnt_list);
> +	INIT_LIST_HEAD(&ve->release_list);
>  	mutex_init(&ve->devmnt_mutex);
>  
>  #ifdef CONFIG_AIO
> 



More information about the Devel mailing list