[Devel] [PATCH VZ8 v1 09/14] ve/cgroup: moved release_agent from system_wq to per-ve workqueues
Kirill Tkhai
ktkhai at virtuozzo.com
Mon Jan 25 15:29:19 MSK 2021
On 20.01.2021 12:56, Valeriy Vdovin wrote:
> Each VE should execute release agent notifications within its own
> workqueue. This way we achieve a more fine-grained control over
> release_agent work flushing at VE destruction.
>
> (Cherry-picked from 9fbfb5b4cfb87ba7c9dd63eec5e5e27946a38d3c)
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> ---
> include/linux/cgroup-defs.h | 10 ++-
> include/linux/cgroup.h | 2 +
> include/linux/ve.h | 10 +++
> kernel/cgroup/cgroup-internal.h | 1 +
> kernel/cgroup/cgroup-v1.c | 109 ++++++++++++++++++++++++--------
> kernel/cgroup/cgroup.c | 12 +++-
> kernel/ve/ve.c | 48 ++++++++++++++
> 7 files changed, 159 insertions(+), 33 deletions(-)
>
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index e497387872f4..d6fe95320819 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -453,6 +453,13 @@ struct cgroup {
> */
> struct list_head cset_links;
>
> + /*
> + * Linked list running through all cgroups that can
> + * potentially be reaped by the release agent. Protected by
> + * release_list_lock
> + */
> + struct list_head release_list;
> +
> /*
> * On the default hierarchy, a css_set for a cgroup with some
> * susbsys disabled will point to css's which are associated with
> @@ -490,9 +497,6 @@ struct cgroup {
> /* used to wait for offlining of csses */
> wait_queue_head_t offline_waitq;
>
> - /* used to schedule release agent */
> - struct work_struct release_agent_work;
> -
> /* used to store eBPF programs */
> struct cgroup_bpf bpf;
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index df9a9a09ce2a..0920b2ffb15b 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -897,6 +897,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
> int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
> struct cgroup_namespace *ns);
>
> +void cgroup1_release_agent(struct work_struct *work);
> +
> #ifdef CONFIG_VE
> extern void cgroup_mark_ve_root(struct ve_struct *ve);
> void cgroup_unmark_ve_roots(struct ve_struct *ve);
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index d3c1ab840444..2ab39b607708 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -105,7 +105,15 @@ struct ve_struct {
> unsigned long aio_nr;
> unsigned long aio_max_nr;
> #endif
> + /*
> +	 * cgroups that want to notify about becoming
> +	 * empty are linked to this release_list.
> + */
> + struct list_head release_list;
> + spinlock_t release_list_lock;
> +
> struct workqueue_struct *wq;
> + struct work_struct release_agent_work;
> };
>
> struct ve_devmnt {
> @@ -127,6 +135,8 @@ extern int nr_ve;
> (ve_is_super(get_exec_env()) && capable(CAP_SYS_ADMIN))
>
> #ifdef CONFIG_VE
> +void ve_add_to_release_list(struct cgroup *cgrp);
> +void ve_rm_from_release_list(struct cgroup *cgrp);
> extern struct ve_struct *get_ve(struct ve_struct *ve);
> extern void put_ve(struct ve_struct *ve);
>
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index 829997989c41..4de66630d456 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -135,6 +135,7 @@ extern spinlock_t css_set_lock;
> extern struct cgroup_subsys *cgroup_subsys[];
> extern struct list_head cgroup_roots;
> extern struct file_system_type cgroup_fs_type;
> +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp);
>
> /* iterate across the hierarchies */
> #define for_each_root(root) \
> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
> index 21a7c36fbf44..c1891317ae3a 100644
> --- a/kernel/cgroup/cgroup-v1.c
> +++ b/kernel/cgroup/cgroup-v1.c
> @@ -784,7 +784,7 @@ void cgroup1_check_for_release(struct cgroup *cgrp)
> {
> if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
> !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
> - schedule_work(&cgrp->release_agent_work);
> + ve_add_to_release_list(cgrp);
> }
>
> /*
> @@ -822,42 +822,95 @@ static inline int cgroup_path_ve_relative(struct cgroup *ve_root_cgrp,
> */
> void cgroup1_release_agent(struct work_struct *work)
> {
> - struct cgroup *cgrp =
> - container_of(work, struct cgroup, release_agent_work);
> - char *pathbuf = NULL, *agentbuf = NULL;
> - char *argv[3], *envp[3];
> - int ret;
> + struct ve_struct *ve;
> + unsigned long flags;
> + char *agentbuf;
> +
> + agentbuf = kzalloc(PATH_MAX, GFP_KERNEL);
> + if (!agentbuf) {
> + pr_warn("failed to allocate agentbuf\n");
> + return;
> + }
>
> + ve = container_of(work, struct ve_struct, release_agent_work);
> mutex_lock(&cgroup_mutex);
> + spin_lock_irqsave(&ve->release_list_lock, flags);
> + while (!list_empty(&ve->release_list)) {
> + char *argv[3], *envp[3];
> + int i, err;
> + char *pathbuf = NULL;
> + struct cgroup *cgrp, *root_cgrp;
> + const char *release_agent;
> +
> + cgrp = list_entry(ve->release_list.next,
> + struct cgroup,
> + release_list);
> + list_del_init(&cgrp->release_list);
> + spin_unlock_irqrestore(&ve->release_list_lock, flags);
> +
> + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> + if (!pathbuf)
> + goto continue_free;
> + /*
> +		 * At VE destruction root cgroup loses VE_ROOT flag.
> + * Because of that 'cgroup_get_local_root' will not see
> + * VE root and return host's root cgroup instead.
> + * We can detect this because we have a pointer to
> + * original ve coming from work argument.
> + * We do not want to execute VE's notifications on host,
> + * so in this case we skip.
> + */
> + rcu_read_lock();
> + root_cgrp = cgroup_get_local_root(cgrp);
>
> - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
> - if (!pathbuf || !agentbuf || !strlen(agentbuf))
> - goto out;
> + if (rcu_access_pointer(root_cgrp->ve_owner) != ve) {
> + rcu_read_unlock();
> + goto continue_free;
> + }
>
> - spin_lock_irq(&css_set_lock);
> - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
> - spin_unlock_irq(&css_set_lock);
> - if (ret < 0 || ret >= PATH_MAX)
> - goto out;
> + if (cgroup_path_ve_relative(root_cgrp, cgrp, pathbuf,
> + PAGE_SIZE) < 0) {
> + rcu_read_unlock();
> + goto continue_free;
> + }
>
> - argv[0] = agentbuf;
> - argv[1] = pathbuf;
> - argv[2] = NULL;
> + release_agent = ve_get_release_agent_path(root_cgrp);
>
> - /* minimal command environment */
> - envp[0] = "HOME=/";
> - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
> - envp[2] = NULL;
> + *agentbuf = 0;
> + if (release_agent)
> + strncpy(agentbuf, release_agent, PATH_MAX);
> + rcu_read_unlock();
>
> + if (!*agentbuf)
> + goto continue_free;
> +
> + i = 0;
> + argv[i++] = agentbuf;
> + argv[i++] = pathbuf;
> + argv[i] = NULL;
> +
> + i = 0;
> + /* minimal command environment */
> + envp[i++] = "HOME=/";
> + envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
> + envp[i] = NULL;
> +
> + mutex_unlock(&cgroup_mutex);
> + err = call_usermodehelper_ve(ve, argv[0], argv,
> + envp, UMH_WAIT_EXEC);
> +
> + if (err < 0 && ve == &ve0)
> + pr_warn_ratelimited("cgroup1_release_agent "
> + "%s %s failed: %d\n",
> + agentbuf, pathbuf, err);
> + mutex_lock(&cgroup_mutex);
> +continue_free:
> + kfree(pathbuf);
> + spin_lock_irqsave(&ve->release_list_lock, flags);
> + }
> + spin_unlock_irqrestore(&ve->release_list_lock, flags);
> mutex_unlock(&cgroup_mutex);
> - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
> - goto out_free;
> -out:
> - mutex_unlock(&cgroup_mutex);
> -out_free:
> kfree(agentbuf);
> - kfree(pathbuf);
> }
>
> /*
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index ff0a803c3aad..e1d4663780d7 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1973,6 +1973,15 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve)
> spin_unlock_irq(&css_set_lock);
> /* ve_owner == NULL will be visible */
> synchronize_rcu();
> +
> + /*
> + * Anyone already waiting in this wq to execute
> + * cgroup_release_agent doesn't know that ve_owner is NULL,
> + * but we can wait for all of them at flush_workqueue.
> + * After it is complete no other cgroup can seep through
> +	 * to this ve's workqueue, so it's safe to shut down the ve.
> + */
> + flush_workqueue(ve->wq);
> }
>
> struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp)
> @@ -2027,7 +2036,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
>
> init_waitqueue_head(&cgrp->offline_waitq);
> - INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
> }
>
> void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
> @@ -5002,7 +5010,6 @@ static void css_free_rwork_fn(struct work_struct *work)
> /* cgroup free path */
> atomic_dec(&cgrp->root->nr_cgrps);
> cgroup1_pidlist_destroy_all(cgrp);
> - cancel_work_sync(&cgrp->release_agent_work);
>
> if (cgroup_parent(cgrp)) {
> /*
> @@ -5578,6 +5585,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
> * the migration path.
> */
> cgrp->self.flags &= ~CSS_ONLINE;
> + ve_rm_from_release_list(cgrp);
>
> spin_lock_irq(&css_set_lock);
> list_for_each_entry(link, &cgrp->cset_links, cset_link)
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 25455264b225..008e372756f9 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -62,6 +62,11 @@ struct ve_struct ve0 = {
> .meminfo_val = VE_MEMINFO_SYSTEM,
> .vdso_64 = (struct vdso_image*)&vdso_image_64,
> .vdso_32 = (struct vdso_image*)&vdso_image_32,
> + .release_list_lock = __SPIN_LOCK_UNLOCKED(
> + ve0.release_list_lock),
> + .release_list = LIST_HEAD_INIT(ve0.release_list),
> + .release_agent_work = __WORK_INITIALIZER(ve0.release_agent_work,
> + cgroup1_release_agent),
> };
> EXPORT_SYMBOL(ve0);
>
> @@ -403,6 +408,44 @@ static void ve_workqueue_stop(struct ve_struct *ve)
> destroy_workqueue(ve->wq);
> }
>
> +void ve_add_to_release_list(struct cgroup *cgrp)
> +{
> + struct ve_struct *ve;
> + unsigned long flags;
> + int need_schedule_work = 0;
> +
> + rcu_read_lock();
> + ve = cgroup_get_ve_owner(cgrp);
> +
> + spin_lock_irqsave(&ve->release_list_lock, flags);
> + if (!cgroup_is_removed(cgrp) &&
> + list_empty(&cgrp->release_list)) {
> + list_add(&cgrp->release_list, &ve->release_list);
> + need_schedule_work = 1;
> + }
> + spin_unlock_irqrestore(&ve->release_list_lock, flags);
> +
> + if (need_schedule_work)
> + queue_work(ve->wq, &ve->release_agent_work);
> +
> + rcu_read_unlock();
> +}
> +
> +void ve_rm_from_release_list(struct cgroup *cgrp)
> +{
> + struct ve_struct *ve;
> + unsigned long flags;
> +
> + rcu_read_lock();
> + ve = cgroup_get_ve_owner(cgrp);
> +
> + spin_lock_irqsave(&ve->release_list_lock, flags);
> + if (!list_empty(&cgrp->release_list))
> + list_del_init(&cgrp->release_list);
> + spin_unlock_irqrestore(&ve->release_list_lock, flags);
> + rcu_read_unlock();
> +}
> +
> /* under ve->op_sem write-lock */
> static int ve_start_container(struct ve_struct *ve)
> {
> @@ -653,6 +696,10 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
> goto err_vdso;
>
> ve->features = VE_FEATURES_DEF;
> +
> + INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
> + spin_lock_init(&ve->release_list_lock);
> +
> ve->_randomize_va_space = ve0._randomize_va_space;
>
> ve->odirect_enable = 2;
> @@ -673,6 +720,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
> strcpy(ve->core_pattern, "core");
> #endif
> INIT_LIST_HEAD(&ve->devmnt_list);
> + INIT_LIST_HEAD(&ve->release_list);
> mutex_init(&ve->devmnt_mutex);
>
> #ifdef CONFIG_AIO
>
More information about the Devel
mailing list