[Devel] [PATCH 2/2 v1] ve/cgroup: Add pointer to owning ve to root cgroups

Valeriy Vdovin valeriy.vdovin at virtuozzo.com
Tue Mar 17 13:28:15 MSK 2020


Follow-up patch to the per-cgroup release_agent property. release_agent
notifications are spawned from a special kthread that runs under ve0,
but the newly spawned tasks should run in their own ve context. An easy
way to pass this information to the spawning thread is to add a
've_owner' field to each root cgroup. At notification time any cgroup
can be walked upwards to its root, and ve_owner read from there.

https://jira.sw.ru/browse/PSBM-83887
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
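Note for reviewers: below is a condensed sketch of the notification
path this patch creates. The helper is hypothetical (in the patch the
logic sits inline in cgroup_release_agent()); locking and error
handling are trimmed:

	/* Resolve which VE should spawn the release agent for @cgrp. */
	static struct ve_struct *cgroup_release_ve(struct cgroup *cgrp)
	{
		/*
		 * Relative root: root->top_cgroup for host cgroups, or
		 * the nearest ancestor marked CGRP_VE_ROOT for container
		 * cgroups.
		 */
		struct cgroup *root_cgrp = cgroup_get_local_root(cgrp);

		/*
		 * Set by cgroup_mark_ve_root() at container start and
		 * cleared by cgroup_unbind_roots_from_ve() at teardown.
		 * NULL means the owner is already gone, so the
		 * notification is skipped instead of running under ve0.
		 */
		return root_cgrp->ve_owner;
	}

With the owner resolved, the agent is spawned in that VE's context via
call_usermodehelper_fns_ve() rather than plain call_usermodehelper().
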
 include/linux/cgroup.h |  3 +++
 include/linux/ve.h     |  8 ++++++++
 kernel/cgroup.c        | 31 +++++++++++++++++++++++++++++++
 kernel/ve/ve.c         |  3 +++
 4 files changed, 45 insertions(+)
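
The ve_owner back-pointer follows the container lifecycle. Roughly
(illustrative call order, simplified from ve_start_container() and
ve_exit_ns(); both writers run under cgroup_mutex):

	/* container start: publish the owner on every root cgroup */
	err = cgroup_mark_ve_root(ve);		/* cgrp->ve_owner = ve */

	/* container teardown: retract it before the ve goes away */
	cgroup_unbind_roots_from_ve(ve);	/* cgrp->ve_owner = NULL */

A release_agent work item reading ve_owner under the same mutex thus
sees either a live VE or NULL.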

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index cad5b4f..513658b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -286,6 +286,9 @@ struct cgroup {
 	struct simple_xattrs xattrs;
 	u64 subgroups_limit;
 
+	/* VE that owns this root cgroup and runs its release agent. */
+	struct ve_struct *ve_owner;
+
 	/*
 	 * Per-cgroup path to release agent binary for release
 	 * notifications.
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 9d60838..9cc5257 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -268,6 +268,14 @@ static inline struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
 struct seq_file;
 struct kernel_cpustat;
 
+/*
+ * A cgroup needs to know its owning ve for some operations, but a
+ * cgroup's lifetime is independent of the ve's: in theory a ve can
+ * be destroyed earlier than some of its cgroups.
+ */
+void ve_add_referring_cgroup(struct ve_struct *ve, struct cgroup *cgrp);
+void ve_remove_referring_cgroups(struct ve_struct *ve);
+
 #if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED)
 int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p);
 int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0b64d88..105536b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4318,6 +4318,7 @@ int cgroup_mark_ve_root(struct ve_struct *ve)
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		cgrp = task_cgroup_from_root(ve->init_task, root);
+		cgrp->ve_owner = ve;
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
 		err = cgroup_add_file_on_mark_ve(cgrp);
 		if (err)
@@ -4329,6 +4330,19 @@ int cgroup_mark_ve_root(struct ve_struct *ve)
 	return err;
 }
 
+void cgroup_unbind_roots_from_ve(struct ve_struct *ve)
+{
+	struct cgroup *cgrp;
+	struct cgroupfs_root *root;
+
+	mutex_lock(&cgroup_mutex);
+	for_each_active_root(root) {
+		cgrp = task_cgroup_from_root(ve->init_task, root);
+		cgrp->ve_owner = NULL;
+	}
+	mutex_unlock(&cgroup_mutex);
+}
+
 struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
 {
 	struct cgroup *ve_root = NULL;
@@ -5455,6 +5469,7 @@ static void cgroup_release_agent(struct work_struct *work)
 	raw_spin_lock(&release_list_lock);
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
+		struct ve_struct *ve;
 		int i, err;
 		char *pathbuf = NULL, *agentbuf = NULL;
 		struct cgroup *root_cgrp;
@@ -5468,7 +5483,18 @@ static void cgroup_release_agent(struct work_struct *work)
 			goto continue_free;
 		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0)
 			goto continue_free;
+
+		/*
+		 * root_cgrp is the relative root for cgrp: for host
+		 * cgroups root_cgrp is root->top_cgroup, for container
+		 * cgroups it is the nearest ancestor of cgrp marked
+		 * as VE_ROOT.
+		 */
 		root_cgrp = cgroup_get_local_root(cgrp);
+
+		ve = root_cgrp->ve_owner;
+		if (!ve)
+			goto continue_free;
 		if (root_cgrp->release_agent_path)
 			agentbuf = kstrdup(root_cgrp->release_agent_path,
 				GFP_KERNEL);
@@ -5490,7 +5516,12 @@
 		 * since the exec could involve hitting disk and hence
 		 * be a slow process */
 		mutex_unlock(&cgroup_mutex);
+#ifdef CONFIG_VE
+		err = call_usermodehelper_fns_ve(ve, argv[0], argv,
+			envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
+#else
 		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+#endif
 		if (err < 0)
 			pr_warn_ratelimited("cgroup release_agent "
 					    "%s %s failed: %d\n",
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index a64b4a7..37353fb 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -480,6 +480,7 @@ static void ve_drop_context(struct ve_struct *ve)
 static const struct timespec zero_time = { };
 
 extern int cgroup_mark_ve_root(struct ve_struct *ve);
+extern void cgroup_unbind_roots_from_ve(struct ve_struct *ve);
 
 /* under ve->op_sem write-lock */
 static int ve_start_container(struct ve_struct *ve)
@@ -588,10 +589,12 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
 	up_write(&ve->op_sem);
 }
 
 void ve_exit_ns(struct pid_namespace *pid_ns)
 {
 	struct ve_struct *ve = current->task_ve;
 
+	cgroup_unbind_roots_from_ve(ve);
+
 	/*
 	 * current->cgroups already switched to init_css_set in cgroup_exit(),
 	 * but current->task_ve still points to our exec ve.
-- 
1.8.3.1
