[Devel] [PATCH 2/2 v0] ve/cgroup: Added cross links between cgroups and owning ve
Valeriy Vdovin
valeriy.vdovin at virtuozzo.com
Fri Mar 13 13:09:29 MSK 2020
Follow-up patch to per-cgroup release_agent property. Currently there is
a problem with release_agent notification semantics that all
notification processes spawn under ve0 (host). Due to that any processes
inside of a container expecting a notification will never get it
because notification will never hit container namespaces. To address
this problem a process should be launched from the namespaces of the
ve that owns the cgroup. So we need to somehow know the right 've' during
spawning of a notification process. This time it's not possible to use
current->task_ve, because notifications are spawned from a dedicated
kthread which knows which cgroup needs to be released, but knows nothing
about what ve 'owns' this cgroup. Naturally, this can be solved by
adding 've_owner' field to 'struct cgroup'.
It's not enough because cgroup's lifespan isn't bound to ve's, ve
might be destroyed earlier than cgroup and opposite is also true. At any
time a link to ve from cgroup might become invalid without proper
cleanup.
To manage this we add a list of all cgroups into ve structure. All
cgroups that need a reference to owning ve, are added to this list. At
destruction ve will clean all references on itself from cgroups in this
list.
https://jira.sw.ru/browse/PSBM-83887
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
include/linux/cgroup.h | 11 ++++++++
include/linux/ve.h | 19 +++++++++++++
kernel/cgroup.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/ve/ve.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 178 insertions(+)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index cad5b4f..833f249 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -287,6 +287,17 @@ struct cgroup {
u64 subgroups_limit;
/*
+ * Cgroups that have a non-empty release_agent_path get into this
+ * list. At ve destruction, ve will walk this list
+ * to unlink itself from each cgroup.
+ */
+ struct list_head ve_reference;
+
+ /* ve_owner, responsible for running release agent. */
+ struct ve_struct *ve_owner;
+ struct mutex ve_owner_mutex;
+
+ /*
* Per-cgroup path to release agent binary for release
* notifications.
*/
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 9d60838..49f772c 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -91,6 +91,17 @@ struct ve_struct {
unsigned long down_at;
struct list_head cleanup_list;
+ /*
+ * cgroups that need to be unreferenced from this ve
+ * at this ve's destruction
+ */
+ struct list_head referring_cgroups;
+
+ /*
+ * operations on referring_cgroups are performed under this
+ * lock
+ */
+ spinlock_t referring_cgroups_lock;
unsigned long meminfo_val;
int _randomize_va_space;
@@ -268,6 +279,14 @@ static inline struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
struct seq_file;
struct kernel_cpustat;
+/*
+ * cgroup needs to know its owning ve for some operations, but
+ * cgroup's lifetime is independent of ve's; in theory ve can be
+ * destroyed earlier than some of its cgroups.
+ */
+void ve_add_referring_cgroup(struct ve_struct *ve, struct cgroup *cgrp);
+void ve_remove_referring_cgroups(struct ve_struct *ve);
+
#if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED)
int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p);
int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0b64d88..3ecdb5b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1442,10 +1442,12 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
+ INIT_LIST_HEAD(&cgrp->ve_reference);
mutex_init(&cgrp->pidlist_mutex);
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
simple_xattrs_init(&cgrp->xattrs);
+ mutex_init(&cgrp->ve_owner_mutex);
}
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -2325,6 +2327,14 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
+#ifdef CONFIG_VE
+ /*
+ * This is the only place when we can bind ve information to
+ * cgroup, assuming that the writer does it inside of the
+ * right VE.
+ */
+ ve_add_referring_cgroup(get_exec_env(), cgrp);
+#endif
ret = cgroup_set_release_agent_locked(cgrp, buffer);
mutex_unlock(&cgroup_mutex);
return ret;
@@ -4534,6 +4544,36 @@ static void css_ref_killed_fn(struct percpu_ref *ref)
cgroup_css_killed(css->cgroup);
}
+/*
+ * cgroup_unreference_ve removes cgroup's reference to ve.
+ * ve_owner_mutex ensures the ve pointer stays valid; then we check
+ * whether the cgroup is still on ve's list. Otherwise ve is already
+ * removing the cgroup itself, so we don't need to do it.
+ */
+void cgroup_unreference_ve(struct cgroup *cgrp)
+{
+ struct list_head *pos;
+ /*
+ * We are only safe to query ve_owner under its lock, because
+ * ve could zero it out at destruction step.
+ */
+ mutex_lock(&cgrp->ve_owner_mutex);
+ if (cgrp->ve_owner) {
+ struct ve_struct *ve = cgrp->ve_owner;
+ spin_lock(&ve->referring_cgroups_lock);
+ list_for_each(pos, &ve->referring_cgroups) {
+ if (pos == &cgrp->ve_reference) {
+ list_del(&cgrp->ve_reference);
+ break;
+ }
+ }
+ spin_unlock(&ve->referring_cgroups_lock);
+ cgrp->ve_owner = NULL;
+ }
+ mutex_unlock(&cgrp->ve_owner_mutex);
+}
+
+
/**
* cgroup_destroy_locked - the first stage of cgroup destruction
* @cgrp: cgroup to be destroyed
@@ -4645,6 +4685,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
}
spin_unlock(&cgrp->event_list_lock);
+ cgroup_unreference_ve(cgrp);
+
kfree(cgrp->release_agent_path);
return 0;
@@ -5455,6 +5497,7 @@ static void cgroup_release_agent(struct work_struct *work)
raw_spin_lock(&release_list_lock);
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
+ struct ve_struct *ve;
int i, err;
char *pathbuf = NULL, *agentbuf = NULL;
struct cgroup *root_cgrp;
@@ -5468,7 +5511,33 @@ static void cgroup_release_agent(struct work_struct *work)
goto continue_free;
if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0)
goto continue_free;
+
+ /*
+ * root_cgrp is the relative root for cgrp, for host
+ * cgroups root_cgrp is root->top_cgroup, for container
+ * cgroups it is any up the parent chain from cgrp marked
+ * as VE_ROOT.
+ */
root_cgrp = cgroup_get_local_root(cgrp);
+
+ /*
+ * Under lock we are safe to check that ve_owner is not
+ * NULL. While cgroup has a non-NULL pointer to VE, we are
+ * safe to increase its refcount, and while that reference
+ * is held, we are safe to access its memory. If at this
+ * point VE undergoes destruction steps, at worst
+ * call_usermodehelper_fns_ve will gracefully fail, but
+ * nobody will care at destruction.
+ */
+ ve = NULL;
+ mutex_lock(&root_cgrp->ve_owner_mutex);
+ if (root_cgrp->ve_owner) {
+ ve = root_cgrp->ve_owner;
+ get_ve(ve);
+ }
+ mutex_unlock(&root_cgrp->ve_owner_mutex);
+ if (!ve)
+ goto continue_free;
if (root_cgrp->release_agent_path)
agentbuf = kstrdup(root_cgrp->release_agent_path,
GFP_KERNEL);
@@ -5490,7 +5559,13 @@ static void cgroup_release_agent(struct work_struct *work)
* since the exec could involve hitting disk and hence
* be a slow process */
mutex_unlock(&cgroup_mutex);
+#ifdef CONFIG_VE
+ err = call_usermodehelper_fns_ve(ve, argv[0], argv,
+ envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
+ put_ve(ve);
+#else
err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+#endif
if (err < 0)
pr_warn_ratelimited("cgroup release_agent "
"%s %s failed: %d\n",
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index a64b4a7..4ead5cc 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -575,6 +575,13 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
ve->is_running = 0;
/*
+ * After is_running is cleared, we are safe to cleanup
+ * referring cgroups list. Additional cgroups will fail
+ * to add themselves on is_running check.
+ */
+ ve_remove_referring_cgroups(ve);
+
+ /*
* Neither it can be in pseudosuper state
* anymore, setup it again if needed.
*/
@@ -704,6 +711,7 @@ do_init:
INIT_LIST_HEAD(&ve->devices);
INIT_LIST_HEAD(&ve->ve_list);
INIT_LIST_HEAD(&ve->devmnt_list);
+ INIT_LIST_HEAD(&ve->referring_cgroups);
mutex_init(&ve->devmnt_mutex);
#ifdef CONFIG_AIO
@@ -1713,3 +1721,68 @@ int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat *kstat)
}
EXPORT_SYMBOL(ve_get_cpu_stat);
#endif /* CONFIG_CGROUP_SCHED */
+
+/*
+ * Add cgroup to list of cgroups that hold a pointer to this
+ * ve. List and ve is protected by lock.
+ */
+void ve_add_referring_cgroup(struct ve_struct *ve, struct cgroup *cgrp)
+{
+ /*
+ * The only case when we are adding a cgroup and ve isn't
+ * running is during or after ve_stop_ns. And it's already
+ * late to add.
+ */
+ if (!ve->is_running)
+ return;
+
+ /*
+ * ve_owner is protected by lock, so that cgroup could safely
+ * call get_ve if ve_owner is not NULL
+ */
+ mutex_lock(&cgrp->ve_owner_mutex);
+ cgrp->ve_owner = ve;
+ mutex_unlock(&cgrp->ve_owner_mutex);
+
+ spin_lock(&ve->referring_cgroups_lock);
+ list_add_tail(&cgrp->ve_reference, &ve->referring_cgroups);
+ spin_unlock(&ve->referring_cgroups_lock);
+}
+
+/*
+ * ve_remove_referring_cgroups is called at destruction of VE to
+ * eliminate all references to this ve from the cgroups on its list.
+ */
+void ve_remove_referring_cgroups(struct ve_struct *ve)
+{
+ LIST_HEAD(list);
+ struct list_head *pos, *next;
+ BUG_ON(ve->is_running);
+
+ /*
+ * Put ve list on stack to resolve cross locking of
+ * ve_owner_mutex and referring_cgroups_lock.
+ */
+ spin_lock(&ve->referring_cgroups_lock);
+ list_splice_init(&ve->referring_cgroups, &list);
+ spin_unlock(&ve->referring_cgroups_lock);
+
+ list_for_each_safe(pos, next, &list) {
+ struct cgroup *cgrp;
+ cgrp = list_entry(pos, struct cgroup, ve_reference);
+ list_del(&cgrp->ve_reference);
+ /*
+ * This lock protects use of the ve list on the cgroup side.
+ * 1. To remove itself from ve's list, a cgroup first needs
+ * a valid ve_owner pointer to find itself in that list;
+ * without the lock it could read a pointer that is valid
+ * at that instant and then keep working with an already
+ * freed ve structure.
+ * 2. cgroup may want to get_ve at some point; the same
+ * logic as in 1. applies to this case.
+ */
+ mutex_lock(&cgrp->ve_owner_mutex);
+ cgrp->ve_owner = NULL;
+ mutex_unlock(&cgrp->ve_owner_mutex);
+ }
+}
--
1.8.3.1
More information about the Devel
mailing list