[Devel] [PATCH 2/2 v0] ve/cgroup: Added cross links between cgroups and owning ve
Valeriy Vdovin
valeriy.vdovin at virtuozzo.com
Fri Mar 13 13:09:29 MSK 2020
Follow-up patch to per-cgroup release_agent property. Currently there is
a problem with release_agent notification semantics that all
notification processes spawn under ve0 (host). Due to that any processes
inside of a container expecting a notification will never get it
because notification will never hit container namespaces. To address
this problem a process should be launched from the namespaces of the
ve that owns the cgroup. So we need to somehow know the right 've' during
spawning of a notification process. This time it's not possible to use
current->task_ve, because notifications are spawned from a dedicated
kthread which knows which cgroup needs to be released, but knows nothing
about what ve 'owns' this cgroup. Naturally, this can be solved by
adding 've_owner' field to 'struct cgroup'.
It's not enough because cgroup's lifespan isn't bound to ve's, ve
might be destroyed earlier than cgroup and opposite is also true. At any
time a link to ve from cgroup might become invalid without proper
cleanup.
To manage this we add a list of all cgroups into ve structure. All
cgroups that need a reference to owning ve, are added to this list. At
destruction ve will clean all references on itself from cgroups in this
list.
https://jira.sw.ru/browse/PSBM-83887
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
include/linux/cgroup.h | 11 ++++++++
include/linux/ve.h | 19 +++++++++++++
kernel/cgroup.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/ve/ve.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 178 insertions(+)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index cad5b4f..833f249 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -287,6 +287,17 @@ struct cgroup {
u64 subgroups_limit;
/*
+ * Cgroups that have a non-empty release_agent_path get into this
+ * list. At ve destruction, ve will walk this list
+ * to unlink itself from each cgroup.
+ */
+ struct list_head ve_reference;
+
+ /* ve_owner, responsible for running release agent. */
+ struct ve_struct *ve_owner;
+ struct mutex ve_owner_mutex;
+
+ /*
* Per-cgroup path to release agent binary for release
* notifications.
*/
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 9d60838..49f772c 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -91,6 +91,17 @@ struct ve_struct {
unsigned long down_at;
struct list_head cleanup_list;
+ /*
+ * cgroups that need to be unreferenced from this ve
+ * at this ve's destruction
+ */
+ struct list_head referring_cgroups;
+
+ /*
+ * operations on referring_cgroups are performed under this
+ * lock
+ */
+ spinlock_t referring_cgroups_lock;
unsigned long meminfo_val;
int _randomize_va_space;
@@ -268,6 +279,14 @@ static inline struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
struct seq_file;
struct kernel_cpustat;
+/*
+ * cgroup needs to know its owning ve for some operations, but
+ * cgroup's lifetime is independent of ve's; in theory ve can be
+ * destroyed earlier than some of its cgroups.
+ */
+void ve_add_referring_cgroup(struct ve_struct *ve, struct cgroup *cgrp);
+void ve_remove_referring_cgroups(struct ve_struct *ve);
+
#if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED)
int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p);
int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0b64d88..3ecdb5b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1442,10 +1442,12 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
+ INIT_LIST_HEAD(&cgrp->ve_reference);
mutex_init(&cgrp->pidlist_mutex);
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
simple_xattrs_init(&cgrp->xattrs);
+ mutex_init(&cgrp->ve_owner_mutex);
}
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -2325,6 +2327,14 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
+#ifdef CONFIG_VE
+ /*
+ * This is the only place when we can bind ve information to
+ * cgroup, assuming that the writer does it inside of the
+ * right VE.
+ */
+ ve_add_referring_cgroup(get_exec_env(), cgrp);
+#endif
ret = cgroup_set_release_agent_locked(cgrp, buffer);
mutex_unlock(&cgroup_mutex);
return ret;
@@ -4534,6 +4544,36 @@ static void css_ref_killed_fn(struct percpu_ref *ref)
cgroup_css_killed(css->cgroup);
}
+/*
+ * cgroup_unreference_ve removes cgroup's reference to ve.
+ * ve_owner_mutex ensures the ve pointer stays valid; then we check
+ * whether the cgroup is still on ve's list. Otherwise ve is already
+ * removing the cgroup itself, so we don't need to do it.
+ */
+void cgroup_unreference_ve(struct cgroup *cgrp)
+{
+ struct list_head *pos;
+ /*
+ * We are only safe to query ve_owner under its lock, because
+ * ve could zero it out at destruction step.
+ */
+ mutex_lock(&cgrp->ve_owner_mutex);
+ if (cgrp->ve_owner) {
+ struct ve_struct *ve = cgrp->ve_owner;
+ spin_lock(&ve->referring_cgroups_lock);
+ list_for_each(pos, &ve->referring_cgroups) {
+ if (pos == &cgrp->ve_reference) {
+ list_del(&cgrp->ve_reference);
+ break;
+ }
+ }
+ spin_unlock(&ve->referring_cgroups_lock);
+ cgrp->ve_owner = NULL;
+ }
+ mutex_unlock(&cgrp->ve_owner_mutex);
+}
+
+
/**
* cgroup_destroy_locked - the first stage of cgroup destruction
* @cgrp: cgroup to be destroyed
@@ -4645,6 +4685,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
}
spin_unlock(&cgrp->event_list_lock);
+ cgroup_unreference_ve(cgrp);
+
kfree(cgrp->release_agent_path);
return 0;
@@ -5455,6 +5497,7 @@ static void cgroup_release_agent(struct work_struct *work)
raw_spin_lock(&release_list_lock);
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
+ struct ve_struct *ve;
int i, err;
char *pathbuf = NULL, *agentbuf = NULL;
struct cgroup *root_cgrp;
@@ -5468,7 +5511,33 @@ static void cgroup_release_agent(struct work_struct *work)
goto continue_free;
if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0)
goto continue_free;
+
+ /*
+ * root_cgrp is the relative root for cgrp, for host
+ * cgroups root_cgrp is root->top_cgroup, for container
+ * cgroups it is any up the parent chain from cgrp marked
+ * as VE_ROOT.
+ */
root_cgrp = cgroup_get_local_root(cgrp);
+
+ /*
+ * Under lock we are safe to check that ve_owner is not
+ * NULL. While cgroup has a non-NULL pointer to VE, we are
+ * safe to increase its refcount, and while that reference
+ * is held, we are safe to access its memory. If at this
+ * point VE undergoes destruction steps, at worst
+ * call_usermodehelper_fns_ve will gracefully fail, but
+ * nobody will care at destruction.
+ */
+ ve = NULL;
+ mutex_lock(&root_cgrp->ve_owner_mutex);
+ if (root_cgrp->ve_owner) {
+ ve = root_cgrp->ve_owner;
+ get_ve(ve);
+ }
+ mutex_unlock(&root_cgrp->ve_owner_mutex);
+ if (!ve)
+ goto continue_free;
if (root_cgrp->release_agent_path)
agentbuf = kstrdup(root_cgrp->release_agent_path,
GFP_KERNEL);
@@ -5490,7 +5559,13 @@ static void cgroup_release_agent(struct work_struct *work)
* since the exec could involve hitting disk and hence
* be a slow process */
mutex_unlock(&cgroup_mutex);
+#ifdef CONFIG_VE
+ err = call_usermodehelper_fns_ve(ve, argv[0], argv,
+ envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
+ put_ve(ve);
+#else
err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+#endif
if (err < 0)
pr_warn_ratelimited("cgroup release_agent "
"%s %s failed: %d\n",
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index a64b4a7..4ead5cc 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -575,6 +575,13 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
ve->is_running = 0;
/*
+ * After is_running is cleared, we are safe to cleanup
+ * referring cgroups list. Additional cgroups will fail
+ * to add themselves on is_running check.
+ */
+ ve_remove_referring_cgroups(ve);
+
+ /*
* Neither it can be in pseudosuper state
* anymore, setup it again if needed.
*/
@@ -704,6 +711,7 @@ do_init:
INIT_LIST_HEAD(&ve->devices);
INIT_LIST_HEAD(&ve->ve_list);
INIT_LIST_HEAD(&ve->devmnt_list);
+ INIT_LIST_HEAD(&ve->referring_cgroups);
mutex_init(&ve->devmnt_mutex);
#ifdef CONFIG_AIO
@@ -1713,3 +1721,68 @@ int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat *kstat)
}
EXPORT_SYMBOL(ve_get_cpu_stat);
#endif /* CONFIG_CGROUP_SCHED */
+
+/*
+ * Add cgroup to list of cgroups that hold a pointer to this
+ * ve. List and ve is protected by lock.
+ */
+void ve_add_referring_cgroup(struct ve_struct *ve, struct cgroup *cgrp)
+{
+ /*
+ * The only case when we are adding a cgroup and ve isn't
+ * running is during or after ve_stop_ns. And it's already
+ * late to add.
+ */
+ if (!ve->is_running)
+ return;
+
+ /*
+ * ve_owner is protected by lock, so that cgroup could safely
+ * call get_ve if ve_owner is not NULL
+ */
+ mutex_lock(&cgrp->ve_owner_mutex);
+ cgrp->ve_owner = ve;
+ mutex_unlock(&cgrp->ve_owner_mutex);
+
+ spin_lock(&ve->referring_cgroups_lock);
+ list_add_tail(&cgrp->ve_reference, &ve->referring_cgroups);
+ spin_unlock(&ve->referring_cgroups_lock);
+}
+
+/*
+ * ve_remove_referring_cgroups is called at destruction of VE to
+ * eliminate all references to this ve from the cgroups on its list.
+ */
+void ve_remove_referring_cgroups(struct ve_struct *ve)
+{
+ LIST_HEAD(list);
+ struct list_head *pos, *next;
+ BUG_ON(ve->is_running);
+
+ /*
+ * Put ve list on stack to resolve cross locking of
+ * ve_owner_mutex and referring_cgroups_lock.
+ */
+ spin_lock(&ve->referring_cgroups_lock);
+ list_splice_init(&ve->referring_cgroups, &list);
+ spin_unlock(&ve->referring_cgroups_lock);
+
+ list_for_each_safe(pos, next, &list) {
+ struct cgroup *cgrp;
+ cgrp = list_entry(pos, struct cgroup, ve_reference);
+ list_del(&cgrp->ve_reference);
+ /*
+ * This lock protects use of the ve list on the cgroup side.
+ * 1. To remove itself from ve's list, a cgroup first needs
+ * a valid ve_owner pointer to find itself in that list;
+ * without the lock it could read a pointer that is valid
+ * at that instant and then keep working with an already
+ * freed ve structure.
+ * 2. cgroup may want to get_ve at some point; the same
+ * logic as in 1. applies to this case.
+ */
+ mutex_lock(&cgrp->ve_owner_mutex);
+ cgrp->ve_owner = NULL;
+ mutex_unlock(&cgrp->ve_owner_mutex);
+ }
+}
--
1.8.3.1
More information about the Devel
mailing list