[Devel] [PATCH 4/6 v7] ve/cgroup: Moved cgroup release notifications to per-ve workqueues.

Kirill Tkhai ktkhai at virtuozzo.com
Thu Apr 16 11:21:26 MSK 2020


On 15.04.2020 21:26, Valeriy Vdovin wrote:
> Notifications about cgroups becoming empty should be executed from
> inside the VE that owns the cgroup. This patch splits the logic of
> notification execution between ve code and cgroup code. The VE is
> now responsible for storing the list of cgroups that want to notify
> about themselves. The VE also manages the execution of release_agent
> work under its domain, so the usermode process is spawned in this
> VE's namespaces.
> The VE is now responsible for preventing release_agent works from
> being scheduled while it is stopping.
> cgroup code is still responsible for setting/getting the release_agent
> path parameter and passing it between mounts, but now it also stores
> the owning ve as a field in the cgroup struct.
> 
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>

This patch could be split further, as we discussed last time.

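For readers following the flow: below is my condensed sketch of the new
notification path, stitched together from the hunks in this patch (the
function name is mine; locking, refcounting and error handling are
simplified, the calls are the ones the patch adds). The point is that the
release agent now executes in the container's context rather than the host's.

	/* Producer side: check_for_release() now routes through the owning VE. */
	static void sketch_add_to_release_list(struct cgroup *cgrp)
	{
		struct ve_struct *ve;

		rcu_read_lock();
		/* Nearest CGRP_VE_ROOT ancestor's ve_owner, or ve0 for the host. */
		ve = cgroup_get_ve_owner(cgrp);

		raw_spin_lock(&ve->release_list_lock);
		if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list))
			list_add(&cgrp->release_list, &ve->release_list);
		raw_spin_unlock(&ve->release_list_lock);

		/* The consumer, cgroup_release_agent(), runs on the VE's own
		 * workqueue and spawns the agent via call_usermodehelper_fns_ve(),
		 * i.e. inside that VE's namespaces. */
		queue_work(ve->wq, &ve->release_agent_work);
		rcu_read_unlock();
	}
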
> ---
>  include/linux/cgroup.h |  27 ++++++++
>  include/linux/ve.h     |  17 +++++
>  kernel/cgroup.c        | 168 ++++++++++++++++++++++++++++++++++---------------
>  kernel/ve/ve.c         |  62 +++++++++++++++++-
>  4 files changed, 221 insertions(+), 53 deletions(-)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 0239518..fa5b236 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -292,6 +292,9 @@ struct cgroup {
>  	/* directory xattrs */
>  	struct simple_xattrs xattrs;
>  	u64 subgroups_limit;
> +
> +	/* ve_owner, responsible for running release agent. */
> +	struct ve_struct *ve_owner;
>  };
>  
>  #define MAX_CGROUP_ROOT_NAMELEN 64
> @@ -443,6 +446,27 @@ struct css_set {
>  };
>  
>  /*
> + * refcounted get/put for css_set objects
> + */
> +
> +void __put_css_set(struct css_set *cg, int taskexit);
> +
> +static inline void get_css_set(struct css_set *cg)
> +{
> +	atomic_inc(&cg->refcount);
> +}
> +
> +static inline void put_css_set(struct css_set *cg)
> +{
> +	__put_css_set(cg, 0);
> +}
> +
> +static inline void put_css_set_taskexit(struct css_set *cg)
> +{
> +	__put_css_set(cg, 1);
> +}
> +
> +/*
>   * cgroup_map_cb is an abstract callback API for reporting map-valued
>   * control files
>   */
> @@ -613,9 +637,12 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
>  int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen);
>  
>  int cgroup_task_count(const struct cgroup *cgrp);
> +void cgroup_release_agent(struct work_struct *work);
>  
>  #ifdef CONFIG_VE
>  void cgroup_mark_ve_roots(struct ve_struct *ve);
> +void cgroup_unmark_ve_roots(struct ve_struct *ve);
> +struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
>  #endif
>  
>  /*
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 362dae1..989f73a 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -126,7 +126,22 @@ struct ve_struct {
>  #endif
>  	struct kmapset_key	sysfs_perms_key;
>  
> +	/*
> +	 * Cgroups that want to notify about becoming
> +	 * empty are linked to this release_list.
> +	 */
> +	struct list_head	release_list;
> +	struct raw_spinlock	release_list_lock;
> +
>  	struct workqueue_struct	*wq;
> +	struct work_struct	release_agent_work;
> +
> +	/*
> +	 * All tasks that belong to this ve live
> +	 * in cgroups that are children of the
> +	 * cgroups that form this css_set.
> +	 */
> +	struct css_set __rcu	*root_css_set;
>  };
>  
>  struct ve_devmnt {
> @@ -190,6 +205,8 @@ call_usermodehelper_ve(struct ve_struct *ve, char *path, char **argv,
>  }
>  void do_update_load_avg_ve(void);
>  
> +void ve_add_to_release_list(struct cgroup *cgrp);
> +void ve_rm_from_release_list(struct cgroup *cgrp);
>  extern struct ve_struct *get_ve(struct ve_struct *ve);
>  extern void put_ve(struct ve_struct *ve);
>  
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index fcbf4e59..c6af875 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -269,10 +269,6 @@ static bool cgroup_lock_live_group(struct cgroup *cgrp)
>  
>  /* the list of cgroups eligible for automatic release. Protected by
>   * release_list_lock */
> -static LIST_HEAD(release_list);
> -static DEFINE_RAW_SPINLOCK(release_list_lock);
> -static void cgroup_release_agent(struct work_struct *work);
> -static DECLARE_WORK(release_agent_work, cgroup_release_agent);
>  static void check_for_release(struct cgroup *cgrp);
>  
>  /* Link structure for associating css_set objects with cgroups */
> @@ -333,7 +329,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
>   * compiled into their kernel but not actually in use */
>  static int use_task_css_set_links __read_mostly;
>  
> -static void __put_css_set(struct css_set *cg, int taskexit)
> +void __put_css_set(struct css_set *cg, int taskexit)
>  {
>  	struct cg_cgroup_link *link;
>  	struct cg_cgroup_link *saved_link;
> @@ -382,24 +378,6 @@ static void __put_css_set(struct css_set *cg, int taskexit)
>  }
>  
>  /*
> - * refcounted get/put for css_set objects
> - */
> -static inline void get_css_set(struct css_set *cg)
> -{
> -	atomic_inc(&cg->refcount);
> -}
> -
> -static inline void put_css_set(struct css_set *cg)
> -{
> -	__put_css_set(cg, 0);
> -}
> -
> -static inline void put_css_set_taskexit(struct css_set *cg)
> -{
> -	__put_css_set(cg, 1);
> -}
> -
> -/*
>   * compare_css_sets - helper function for find_existing_css_set().
>   * @cg: candidate css_set being tested
>   * @old_cg: existing css_set for a task
> @@ -652,6 +630,35 @@ static struct css_set *find_css_set(
>  	return res;
>  }
>  
> +static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
> +					    struct cgroupfs_root *root)
> +{
> +	struct cgroup *res = NULL;
> +
> +	BUG_ON(!mutex_is_locked(&cgroup_mutex));
> +	read_lock(&css_set_lock);
> +	/*
> +	 * No need to lock the task - since we hold cgroup_mutex the
> +	 * task can't change groups, so the only thing that can happen
> +	 * is that it exits and its css is set back to init_css_set.
> +	 */
> +	if (css_set == &init_css_set) {
> +		res = &root->top_cgroup;
> +	} else {
> +		struct cg_cgroup_link *link;
> +		list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
> +			struct cgroup *c = link->cgrp;
> +			if (c->root == root) {
> +				res = c;
> +				break;
> +			}
> +		}
> +	}
> +	read_unlock(&css_set_lock);
> +	BUG_ON(!res);
> +	return res;
> +}
> +
>  /*
>   * Return the cgroup for "task" from the given hierarchy. Must be
>   * called with cgroup_mutex held.
> @@ -794,6 +801,42 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
>  	return name;
>  }
>  
> +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
> +{
> +	/*
> +	 * Find nearest root cgroup, which might be host cgroup root
> +	 * or ve cgroup root.
> +	 *
> +	 *    <host_root_cgroup> -> local_root
> +	 *     \                    ^
> +	 *      <cgroup>            |
> +	 *       \                  |
> +	 *        <cgroup>   --->   from here
> +	 *        \
> +	 *         <ve_root_cgroup> -> local_root
> +	 *         \                   ^
> +	 *          <cgroup>           |
> +	 *          \                  |
> +	 *           <cgroup>  --->    from here
> +	 */
> +	while (cgrp->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags))
> +		cgrp = cgrp->parent;
> +
> +	return cgrp;
> +}
> +
> +struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp)
> +{
> +	struct ve_struct *ve;
> +	/* Caller should hold RCU */
> +
> +	cgrp = cgroup_get_local_root(cgrp);
> +	ve = rcu_dereference(cgrp->ve_owner);
> +	if (!ve)
> +		ve = get_ve0();
> +	return ve;
> +}
> +
>  static void cgroup_free_fn(struct work_struct *work)
>  {
>  	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
> @@ -1636,6 +1679,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
>  		const struct cred *cred;
>  		int i;
>  		struct css_set *cg;
> +		root_cgrp->ve_owner = get_exec_env();
>  
>  		BUG_ON(sb->s_root != NULL);
>  
> @@ -4243,6 +4287,7 @@ void cgroup_mark_ve_roots(struct ve_struct *ve)
>  	mutex_lock(&cgroup_mutex);
>  	for_each_active_root(root) {
>  		cgrp = task_cgroup_from_root(ve->init_task, root);
> +		cgrp->ve_owner = ve;
>  		set_bit(CGRP_VE_ROOT, &cgrp->flags);
>  
>  		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
> @@ -4251,6 +4296,25 @@ void cgroup_mark_ve_roots(struct ve_struct *ve)
>  	mutex_unlock(&cgroup_mutex);
>  }
>  
> +void cgroup_unmark_ve_roots(struct ve_struct *ve)
> +{
> +	struct cgroup *cgrp;
> +	struct cgroupfs_root *root;
> +
> +	mutex_lock(&cgroup_mutex);
> +	for_each_active_root(root) {
> +		cgrp = css_cgroup_from_root(ve->root_css_set, root);
> +		BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
> +				lockdep_is_held(&cgroup_mutex)));
> +		rcu_assign_pointer(cgrp->ve_owner, NULL);
> +		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> +	}
> +	mutex_unlock(&cgroup_mutex);
> +	/* ve_owner == NULL will be visible */
> +	synchronize_rcu();
> +	flush_workqueue(ve->wq);
> +}
> +
>  struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
>  {
>  	struct cgroup *ve_root = NULL;
> @@ -4543,11 +4607,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
>  
>  	set_bit(CGRP_REMOVED, &cgrp->flags);
>  
> -	raw_spin_lock(&release_list_lock);
> -	if (!list_empty(&cgrp->release_list))
> -		list_del_init(&cgrp->release_list);
> -	raw_spin_unlock(&release_list_lock);
> -
> +	ve_rm_from_release_list(cgrp);
>  	/*
>  	 * Remove @cgrp directory.  The removal puts the base ref but we
>  	 * aren't quite done with @cgrp yet, so hold onto it.
> @@ -5331,17 +5391,7 @@ static void check_for_release(struct cgroup *cgrp)
>  		 * already queued for a userspace notification, queue
>  		 * it now
>  		 */
> -		int need_schedule_work = 0;
> -
> -		raw_spin_lock(&release_list_lock);
> -		if (!cgroup_is_removed(cgrp) &&
> -		    list_empty(&cgrp->release_list)) {
> -			list_add(&cgrp->release_list, &release_list);
> -			need_schedule_work = 1;
> -		}
> -		raw_spin_unlock(&release_list_lock);
> -		if (need_schedule_work)
> -			schedule_work(&release_agent_work);
> +		ve_add_to_release_list(cgrp);
>  	}
>  }
>  
> @@ -5368,25 +5418,37 @@ static void check_for_release(struct cgroup *cgrp)
>   * this routine has no use for the exit status of the release agent
>   * task, so no sense holding our caller up for that.
>   */
> -static void cgroup_release_agent(struct work_struct *work)
> +void cgroup_release_agent(struct work_struct *work)
>  {
> -	BUG_ON(work != &release_agent_work);
> +	struct ve_struct *ve;
> +	ve = container_of(work, struct ve_struct, release_agent_work);
>  	mutex_lock(&cgroup_mutex);
> -	raw_spin_lock(&release_list_lock);
> -	while (!list_empty(&release_list)) {
> +	raw_spin_lock(&ve->release_list_lock);
> +	while (!list_empty(&ve->release_list)) {
>  		char *argv[3], *envp[3];
>  		int i, err;
> +		const char *release_agent;
>  		char *pathbuf = NULL, *agentbuf = NULL;
> -		struct cgroup *cgrp = list_entry(release_list.next,
> -						    struct cgroup,
> -						    release_list);
> +		struct cgroup *cgrp, *root_cgrp;
> +
> +		cgrp = list_entry(ve->release_list.next,
> +				  struct cgroup,
> +				  release_list);
> +
>  		list_del_init(&cgrp->release_list);
> -		raw_spin_unlock(&release_list_lock);
> +		raw_spin_unlock(&ve->release_list_lock);
>  		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
>  		if (!pathbuf)
>  			goto continue_free;
> -		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
> +		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, 1) < 0)
> +			goto continue_free;
> +		rcu_read_lock();
> +		root_cgrp = cgroup_get_local_root(cgrp);
> +		if (root_cgrp->ve_owner != ve) {
> +			rcu_read_unlock();
>  			goto continue_free;
> +		}
> +		rcu_read_unlock();
>  		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
>  		if (!agentbuf)
>  			goto continue_free;
> @@ -5406,8 +5468,10 @@ static void cgroup_release_agent(struct work_struct *work)
>  		 * since the exec could involve hitting disk and hence
>  		 * be a slow process */
>  		mutex_unlock(&cgroup_mutex);
> -		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
> -		if (err < 0)
> +
> +		err = call_usermodehelper_fns_ve(ve, argv[0], argv,
> +			envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
> +		if (err < 0 && !(ve->init_task->flags & PF_EXITING))
>  			pr_warn_ratelimited("cgroup release_agent "
>  					    "%s %s failed: %d\n",
>  					    agentbuf, pathbuf, err);
> @@ -5416,9 +5480,9 @@ static void cgroup_release_agent(struct work_struct *work)
>   continue_free:
>  		kfree(pathbuf);
>  		kfree(agentbuf);
> -		raw_spin_lock(&release_list_lock);
> +		raw_spin_lock(&ve->release_list_lock);
>  	}
> -	raw_spin_unlock(&release_list_lock);
> +	raw_spin_unlock(&ve->release_list_lock);
>  	mutex_unlock(&cgroup_mutex);
>  }
>  
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 94723e6..7599172 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -87,6 +87,11 @@ struct ve_struct ve0 = {
>  	.netif_max_nr		= INT_MAX,
>  	.arp_neigh_nr		= ATOMIC_INIT(0),
>  	.nd_neigh_nr		= ATOMIC_INIT(0),
> +	.release_list_lock	= __RAW_SPIN_LOCK_UNLOCKED(
> +					ve0.release_list_lock),
> +	.release_list		= LIST_HEAD_INIT(ve0.release_list),
> +	.release_agent_work	= __WORK_INITIALIZER(ve0.release_agent_work,
> +					cgroup_release_agent),
>  };
>  EXPORT_SYMBOL(ve0);
>  
> @@ -450,6 +455,9 @@ static void ve_grab_context(struct ve_struct *ve)
>  
>  	get_task_struct(tsk);
>  	ve->init_task = tsk;
> +	ve->root_css_set = tsk->cgroups;
> +	get_css_set(ve->root_css_set);
> +
>  	ve->init_cred = (struct cred *)get_current_cred();
>  	rcu_assign_pointer(ve->ve_ns, get_nsproxy(tsk->nsproxy));
>  	ve->ve_netns =  get_net(ve->ve_ns->net_ns);
> @@ -462,6 +470,9 @@ static void ve_drop_context(struct ve_struct *ve)
>  	put_net(ve->ve_netns);
>  	ve->ve_netns = NULL;
>  
> +	put_css_set_taskexit(ve->root_css_set);
> +	ve->root_css_set = NULL;
> +
>  	/* Allows to dereference init_cred and init_task if ve_ns is set */
>  	rcu_assign_pointer(ve->ve_ns, NULL);
>  	synchronize_rcu();
> @@ -474,7 +485,6 @@ static void ve_drop_context(struct ve_struct *ve)
>  
>  	put_task_struct(ve->init_task);
>  	ve->init_task = NULL;
> -
>  }
>  
>  static const struct timespec zero_time = { };
> @@ -494,6 +504,50 @@ static void ve_workqueue_stop(struct ve_struct *ve)
>  	destroy_workqueue(ve->wq);
>  }
>  
> +void ve_add_to_release_list(struct cgroup *cgrp)
> +{
> +	struct ve_struct *ve;
> +	int need_schedule_work = 0;
> +
> +	rcu_read_lock();
> +	ve = cgroup_get_ve_owner(cgrp);
> +
> +	raw_spin_lock(&ve->release_list_lock);
> +	if (!cgroup_is_removed(cgrp) &&
> +	    list_empty(&cgrp->release_list)) {
> +		list_add(&cgrp->release_list, &ve->release_list);
> +		need_schedule_work = 1;
> +	}
> +	raw_spin_unlock(&ve->release_list_lock);
> +	/*
> +	 * Because we are under ve->op_sem lock, ve->is_running will
> +	 * be set only after this is already scheduled to ve->wq.
> +	 * Guaranteed to wait until this work is flushed before
> +	 * proceeding with ve_stop_ns.
> +	 */
> +	if (need_schedule_work)
> +		queue_work(ve->wq, &ve->release_agent_work);
> +
> +	rcu_read_unlock();
> +}
> +
> +void ve_rm_from_release_list(struct cgroup *cgrp)
> +{
> +	struct ve_struct *ve;
> +	rcu_read_lock();
> +	ve = cgroup_get_ve_owner(cgrp);
> +	rcu_read_unlock();
> +
> +	/*
> +	 * There is no is_running check here, because the caller
> +	 * is more flexible in using the ve->op_sem lock from above.
> +	 */
> +	raw_spin_lock(&ve->release_list_lock);
> +	if (!list_empty(&cgrp->release_list))
> +		list_del_init(&cgrp->release_list);
> +	raw_spin_unlock(&ve->release_list_lock);
> +}
> +
>  /* under ve->op_sem write-lock */
>  static int ve_start_container(struct ve_struct *ve)
>  {
> @@ -614,6 +668,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
>  	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
>  		return;
>  
> +	cgroup_unmark_ve_roots(ve);
> +
>  	ve_workqueue_stop(ve);
>  
>  	/*
> @@ -687,6 +743,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
>  	if (!ve->ve_name)
>  		goto err_name;
>  
> +	INIT_WORK(&ve->release_agent_work, cgroup_release_agent);
> +	raw_spin_lock_init(&ve->release_list_lock);
> +
>  	ve->_randomize_va_space = ve0._randomize_va_space;
>  
>  	ve->features = VE_FEATURES_DEF;
> @@ -721,6 +780,7 @@ do_init:
>  	INIT_LIST_HEAD(&ve->devices);
>  	INIT_LIST_HEAD(&ve->ve_list);
>  	INIT_LIST_HEAD(&ve->devmnt_list);
> +	INIT_LIST_HEAD(&ve->release_list);
>  	mutex_init(&ve->devmnt_mutex);
>  
>  #ifdef CONFIG_AIO
> 
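And one note on the stop side, as I read it from ve_exit_ns() in this patch
(again my condensed sketch of the ordering, not new code):

	/* Stop-side ordering: make new queuing impossible, then drain. */
	static void sketch_stop_release_agent(struct ve_struct *ve)
	{
		/* Clears cgrp->ve_owner under cgroup_mutex, synchronize_rcu()
		 * so cgroup_get_ve_owner() falls back to ve0, then
		 * flush_workqueue(ve->wq) drains already-queued work. */
		cgroup_unmark_ve_roots(ve);

		/* Only after that is destroy_workqueue(ve->wq) safe. */
		ve_workqueue_stop(ve);
	}
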


