[Devel] [PATCH 4/6 v7] ve/cgroup: Moved cgroup release notifications to per-ve workqueues.

Wed Apr 15 21:26:36 MSK 2020

Notifications about cgroups getting empty should be executed from
inside of a VE, that owns this cgroup. This patch splits logic
of notifications execution between ve code and cgroups code. VE is
now responsible to store the list of cgroups that want to notify
about themselves. VE also now manages the execution of release_agent
work under it's domain, so the usermode process is spawned in this
VE's namespaces.
VE is now responsible to decline possibility of scheduling release_agent
works when it is stopping.
cgroup code is still responsible to manage setting/getting release_agent
path parameter and pass it between mounts, but now it's also
responsible to store owning ve as a filed in cgroup struct.

Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
 include/linux/cgroup.h |  27 ++++++++
 include/linux/ve.h     |  17 +++++
 kernel/cgroup.c        | 168 ++++++++++++++++++++++++++++++++++---------------
 kernel/ve/ve.c         |  62 +++++++++++++++++-
 4 files changed, 221 insertions(+), 53 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0239518..fa5b236 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -292,6 +292,9 @@ struct cgroup {
 	/* directory xattrs */
 	struct simple_xattrs xattrs;
 	u64 subgroups_limit;
+
+	/* ve_owner, responsible for running release agent. */
+	struct ve_struct *ve_owner;
 };
 
 #define MAX_CGROUP_ROOT_NAMELEN 64
@@ -443,6 +446,27 @@ struct css_set {
 };
 
 /*
+ * refcounted get/put for css_set objects
+ */
+
+void __put_css_set(struct css_set *cg, int taskexit);
+
+static inline void get_css_set(struct css_set *cg)
+{
+	atomic_inc(&cg->refcount);
+}
+
+static inline void put_css_set(struct css_set *cg)
+{
+	__put_css_set(cg, 0);
+}
+
+static inline void put_css_set_taskexit(struct css_set *cg)
+{
+	__put_css_set(cg, 1);
+}
+
+/*
  * cgroup_map_cb is an abstract callback API for reporting map-valued
  * control files
  */
@@ -613,9 +637,12 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
 int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
+void cgroup_release_agent(struct work_struct *work);
 
 #ifdef CONFIG_VE
 void cgroup_mark_ve_roots(struct ve_struct *ve);
+void cgroup_unmark_ve_roots(struct ve_struct *ve);
+struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
 #endif
 
 /*
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 362dae1..989f73a 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -126,7 +126,22 @@ struct ve_struct {
 #endif
 	struct kmapset_key	sysfs_perms_key;
 
+	/*
+	 * cgroups, that want to notify about becoming
+	 * empty, are linked to this release_list.
+	 */
+	struct list_head	release_list;
+	struct raw_spinlock	release_list_lock;
+
 	struct workqueue_struct	*wq;
+	struct work_struct	release_agent_work;
+
+	/*
+	 * All tasks, that belong to this ve, live
+	 * in cgroups, that are children to cgroups
+	 * that form this css_set.
+	 */
+	struct css_set __rcu	*root_css_set;
 };
 
 struct ve_devmnt {
@@ -190,6 +205,8 @@ call_usermodehelper_ve(struct ve_struct *ve, char *path, char **argv,
 }
 void do_update_load_avg_ve(void);
 
+void ve_add_to_release_list(struct cgroup *cgrp);
+void ve_rm_from_release_list(struct cgroup *cgrp);
 extern struct ve_struct *get_ve(struct ve_struct *ve);
 extern void put_ve(struct ve_struct *ve);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fcbf4e59..c6af875 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -269,10 +269,6 @@ static bool cgroup_lock_live_group(struct cgroup *cgrp)
 
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
-static LIST_HEAD(release_list);
-static DEFINE_RAW_SPINLOCK(release_list_lock);
-static void cgroup_release_agent(struct work_struct *work);
-static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
 /* Link structure for associating css_set objects with cgroups */
@@ -333,7 +329,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
-static void __put_css_set(struct css_set *cg, int taskexit)
+void __put_css_set(struct css_set *cg, int taskexit)
 {
 	struct cg_cgroup_link *link;
 	struct cg_cgroup_link *saved_link;
@@ -382,24 +378,6 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 }
 
 /*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cg)
-{
-	atomic_inc(&cg->refcount);
-}
-
-static inline void put_css_set(struct css_set *cg)
-{
-	__put_css_set(cg, 0);
-}
-
-static inline void put_css_set_taskexit(struct css_set *cg)
-{
-	__put_css_set(cg, 1);
-}
-
-/*
  * compare_css_sets - helper function for find_existing_css_set().
  * @cg: candidate css_set being tested
  * @old_cg: existing css_set for a task
@@ -652,6 +630,35 @@ static struct css_set *find_css_set(
 	return res;
 }
 
+static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
+					    struct cgroupfs_root *root)
+{
+	struct cgroup *res = NULL;
+
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+	read_lock(&css_set_lock);
+	/*
+	 * No need to lock the task - since we hold cgroup_mutex the
+	 * task can't change groups, so the only thing that can happen
+	 * is that it exits and its css is set back to init_css_set.
+	 */
+	if (css_set == &init_css_set) {
+		res = &root->top_cgroup;
+	} else {
+		struct cg_cgroup_link *link;
+		list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
+			struct cgroup *c = link->cgrp;
+			if (c->root == root) {
+				res = c;
+				break;
+			}
+		}
+	}
+	read_unlock(&css_set_lock);
+	BUG_ON(!res);
+	return res;
+}
+
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
  * called with cgroup_mutex held.
@@ -794,6 +801,42 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 	return name;
 }
 
+struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
+{
+	/*
+	 * Find nearest root cgroup, which might be host cgroup root
+	 * or ve cgroup root.
+	 *
+	 *    <host_root_cgroup> -> local_root
+	 *     \                    ^
+	 *      <cgroup>            |
+	 *       \                  |
+	 *        <cgroup>   --->   from here
+	 *        \
+	 *         <ve_root_cgroup> -> local_root
+	 *         \                   ^
+	 *          <cgroup>           |
+	 *          \                  |
+	 *           <cgroup>  --->    from here
+	 */
+	while (cgrp->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags))
+		cgrp = cgrp->parent;
+
+	return cgrp;
+}
+
+struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp)
+{
+	struct ve_struct *ve;
+	/* Caller should hold RCU */
+
+	cgrp = cgroup_get_local_root(cgrp);
+	ve = rcu_dereference(cgrp->ve_owner);
+	if (!ve)
+		ve = get_ve0();
+	return ve;
+}
+
 static void cgroup_free_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
@@ -1636,6 +1679,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		const struct cred *cred;
 		int i;
 		struct css_set *cg;
+		root_cgrp->ve_owner = get_exec_env();
 
 		BUG_ON(sb->s_root != NULL);
 
@@ -4243,6 +4287,7 @@ void cgroup_mark_ve_roots(struct ve_struct *ve)
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		cgrp = task_cgroup_from_root(ve->init_task, root);
+		cgrp->ve_owner = ve;
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
 
 		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
@@ -4251,6 +4296,25 @@ void cgroup_mark_ve_roots(struct ve_struct *ve)
 	mutex_unlock(&cgroup_mutex);
 }
 
+void cgroup_unmark_ve_roots(struct ve_struct *ve)
+{
+	struct cgroup *cgrp;
+	struct cgroupfs_root *root;
+
+	mutex_lock(&cgroup_mutex);
+	for_each_active_root(root) {
+		cgrp = css_cgroup_from_root(ve->root_css_set, root);
+		BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
+				lockdep_is_held(&cgroup_mutex)));
+		rcu_assign_pointer(cgrp->ve_owner, NULL);
+		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
+	}
+	mutex_unlock(&cgroup_mutex);
+	/* ve_owner == NULL will be visible */
+	synchronize_rcu();
+	flush_workqueue(ve->wq);
+}
+
 struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
 {
 	struct cgroup *ve_root = NULL;
@@ -4543,11 +4607,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 
 	set_bit(CGRP_REMOVED, &cgrp->flags);
 
-	raw_spin_lock(&release_list_lock);
-	if (!list_empty(&cgrp->release_list))
-		list_del_init(&cgrp->release_list);
-	raw_spin_unlock(&release_list_lock);
-
+	ve_rm_from_release_list(cgrp);
 	/*
 	 * Remove @cgrp directory.  The removal puts the base ref but we
 	 * aren't quite done with @cgrp yet, so hold onto it.
@@ -5331,17 +5391,7 @@ static void check_for_release(struct cgroup *cgrp)
 		 * already queued for a userspace notification, queue
 		 * it now
 		 */
-		int need_schedule_work = 0;
-
-		raw_spin_lock(&release_list_lock);
-		if (!cgroup_is_removed(cgrp) &&
-		    list_empty(&cgrp->release_list)) {
-			list_add(&cgrp->release_list, &release_list);
-			need_schedule_work = 1;
-		}
-		raw_spin_unlock(&release_list_lock);
-		if (need_schedule_work)
-			schedule_work(&release_agent_work);
+		ve_add_to_release_list(cgrp);
 	}
 }
 
@@ -5368,25 +5418,37 @@ static void check_for_release(struct cgroup *cgrp)
  * this routine has no use for the exit status of the release agent
  * task, so no sense holding our caller up for that.
  */
-static void cgroup_release_agent(struct work_struct *work)
+void cgroup_release_agent(struct work_struct *work)
 {
-	BUG_ON(work != &release_agent_work);
+	struct ve_struct *ve;
+	ve = container_of(work, struct ve_struct, release_agent_work);
 	mutex_lock(&cgroup_mutex);
-	raw_spin_lock(&release_list_lock);
-	while (!list_empty(&release_list)) {
+	raw_spin_lock(&ve->release_list_lock);
+	while (!list_empty(&ve->release_list)) {
 		char *argv[3], *envp[3];
 		int i, err;
+		const char *release_agent;
 		char *pathbuf = NULL, *agentbuf = NULL;
-		struct cgroup *cgrp = list_entry(release_list.next,
-						    struct cgroup,
-						    release_list);
+		struct cgroup *cgrp, *root_cgrp;
+
+		cgrp = list_entry(ve->release_list.next,
+				  struct cgroup,
+				  release_list);
+
 		list_del_init(&cgrp->release_list);
-		raw_spin_unlock(&release_list_lock);
+		raw_spin_unlock(&ve->release_list_lock);
 		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 		if (!pathbuf)
 			goto continue_free;
-		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, 1) < 0)
+			goto continue_free;
+		rcu_read_lock();
+		root_cgrp = cgroup_get_local_root(cgrp);
+		if (root_cgrp->ve_owner != ve) {
+			rcu_read_unlock();
 			goto continue_free;
+		}
+		rcu_read_unlock();
 		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
 		if (!agentbuf)
 			goto continue_free;
@@ -5406,8 +5468,10 @@ static void cgroup_release_agent(struct work_struct *work)
 		 * since the exec could involve hitting disk and hence
 		 * be a slow process */
 		mutex_unlock(&cgroup_mutex);
-		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-		if (err < 0)
+
+		err = call_usermodehelper_fns_ve(ve, argv[0], argv,
+			envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
+		if (err < 0 && !(ve->init_task->flags & PF_EXITING))
 			pr_warn_ratelimited("cgroup release_agent "
 					    "%s %s failed: %d\n",
 					    agentbuf, pathbuf, err);
@@ -5416,9 +5480,9 @@ static void cgroup_release_agent(struct work_struct *work)
  continue_free:
 		kfree(pathbuf);
 		kfree(agentbuf);
-		raw_spin_lock(&release_list_lock);
+		raw_spin_lock(&ve->release_list_lock);
 	}
-	raw_spin_unlock(&release_list_lock);
+	raw_spin_unlock(&ve->release_list_lock);
 	mutex_unlock(&cgroup_mutex);
 }
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 94723e6..7599172 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -87,6 +87,11 @@ struct ve_struct ve0 = {
 	.netif_max_nr		= INT_MAX,
 	.arp_neigh_nr		= ATOMIC_INIT(0),
 	.nd_neigh_nr		= ATOMIC_INIT(0),
+	.release_list_lock	= __RAW_SPIN_LOCK_UNLOCKED(
+					ve0.release_list_lock),
+	.release_list		= LIST_HEAD_INIT(ve0.release_list),
+	.release_agent_work	= __WORK_INITIALIZER(ve0.release_agent_work,
+					cgroup_release_agent),
 };
 EXPORT_SYMBOL(ve0);
 
@@ -450,6 +455,9 @@ static void ve_grab_context(struct ve_struct *ve)
 
 	get_task_struct(tsk);
 	ve->init_task = tsk;
+	ve->root_css_set = tsk->cgroups;
+	get_css_set(ve->root_css_set);
+
 	ve->init_cred = (struct cred *)get_current_cred();
 	rcu_assign_pointer(ve->ve_ns, get_nsproxy(tsk->nsproxy));
 	ve->ve_netns =  get_net(ve->ve_ns->net_ns);
@@ -462,6 +470,9 @@ static void ve_drop_context(struct ve_struct *ve)
 	put_net(ve->ve_netns);
 	ve->ve_netns = NULL;
 
+	put_css_set_taskexit(ve->root_css_set);
+	ve->root_css_set = NULL;
+
 	/* Allows to dereference init_cred and init_task if ve_ns is set */
 	rcu_assign_pointer(ve->ve_ns, NULL);
 	synchronize_rcu();
@@ -474,7 +485,6 @@ static void ve_drop_context(struct ve_struct *ve)
 
 	put_task_struct(ve->init_task);
 	ve->init_task = NULL;
-
 }
 
 static const struct timespec zero_time = { };
@@ -494,6 +504,50 @@ static void ve_workqueue_stop(struct ve_struct *ve)
 	destroy_workqueue(ve->wq);
 }
 
+void ve_add_to_release_list(struct cgroup *cgrp)
+{
+	struct ve_struct *ve;
+	int need_schedule_work = 0;
+
+	rcu_read_lock();
+	ve = cgroup_get_ve_owner(cgrp);
+
+	raw_spin_lock(&ve->release_list_lock);
+	if (!cgroup_is_removed(cgrp) &&
+	    list_empty(&cgrp->release_list)) {
+		list_add(&cgrp->release_list, &ve->release_list);
+		need_schedule_work = 1;
+	}
+	raw_spin_unlock(&ve->release_list_lock);
+	/*
+	 * Because we are under ve->op_sem lock, ve->is_running will
+	 * be set only after this is already scheduled to ve->wq.
+	 * Guaranteed to wait until this work is flushed before
+	 * proceeding with ve_stop_ns.
+	 */
+	if (need_schedule_work)
+		queue_work(ve->wq, &ve->release_agent_work);
+
+	rcu_read_unlock();
+}
+
+void ve_rm_from_release_list(struct cgroup *cgrp)
+{
+	struct ve_struct *ve;
+	rcu_read_lock();
+	ve = cgroup_get_ve_owner(cgrp);
+	rcu_read_unlock();
+
+	/*
+	 * There is no is_runnig check here, because the caller is
+	 * is more flexible in using ve->op_sem lock from above.
+	 */
+	raw_spin_lock(&ve->release_list_lock);
+	if (!list_empty(&cgrp->release_list))
+		list_del_init(&cgrp->release_list);
+	raw_spin_unlock(&ve->release_list_lock);
+}
+
 /* under ve->op_sem write-lock */
 static int ve_start_container(struct ve_struct *ve)
 {
@@ -614,6 +668,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
 		return;
 
+	cgroup_unmark_ve_roots(ve);
+
 	ve_workqueue_stop(ve);
 
 	/*
@@ -687,6 +743,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
 	if (!ve->ve_name)
 		goto err_name;
 
+	INIT_WORK(&ve->release_agent_work, cgroup_release_agent);
+	raw_spin_lock_init(&ve->release_list_lock);
+
 	ve->_randomize_va_space = ve0._randomize_va_space;
 
 	ve->features = VE_FEATURES_DEF;
@@ -721,6 +780,7 @@ do_init:
 	INIT_LIST_HEAD(&ve->devices);
 	INIT_LIST_HEAD(&ve->ve_list);
 	INIT_LIST_HEAD(&ve->devmnt_list);
+	INIT_LIST_HEAD(&ve->release_list);
 	mutex_init(&ve->devmnt_mutex);
 
 #ifdef CONFIG_AIO
-- 
1.8.3.1