[Devel] [PATCH RHEL8 COMMIT] ve/cgroup: Move release_agent from system_wq to per-ve workqueues
Konstantin Khorenko
khorenko at virtuozzo.com
Wed Mar 3 20:21:13 MSK 2021
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.5
------>
commit f84e37f8fdb3941a85a675f58999919146d238f6
Author: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Date: Wed Mar 3 20:21:13 2021 +0300
ve/cgroup: Move release_agent from system_wq to per-ve workqueues
Each VE should execute release agent notifications within its own
workqueue. This way we achieve a more fine-grained control over
release_agent work flushing at VE destruction.
Cherry-picked from vz7 commit
9fbfb5b4cfb8 ("ve/cgroup: moved release_agent from system_wq to per-ve
workqueues")
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
=====================
Patchset description:
ve/cgroup: Port release_agent virtualization from vz7
This patchset ports cgroup release_agent virtualization from vz7.
The major challenges of porting are the differences between the vz7 and vz8
cgroup implementations:
- transition of cgroups to kernfs
- a slightly changed locking scheme, which relies on css_set_lock in
places that previously relied on cgroup_mutex.
A small number of patches have been ported without modification, but
most of the patches required substantial changes due to the factors
described above.
v1:
- original patchset
v2:
- removed port of CGRP_REMOVED due to the use of CSS_ONLINE in VZ8 for
same reason
- changed the ve_set(get)_release_agent_path signature to a more optimal one
- added ve->is_running check before calling userspace executable
v3:
- use goto after check for ve->is_running in last patch
---
include/linux/cgroup-defs.h | 10 ++--
include/linux/cgroup.h | 2 +
include/linux/ve.h | 10 ++++
kernel/cgroup/cgroup-internal.h | 1 +
kernel/cgroup/cgroup-v1.c | 109 +++++++++++++++++++++++++++++-----------
kernel/cgroup/cgroup.c | 12 ++++-
kernel/ve/ve.c | 48 ++++++++++++++++++
7 files changed, 159 insertions(+), 33 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index a8eb94d2f97f..22d84aa0778e 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -451,6 +451,13 @@ struct cgroup {
*/
struct list_head cset_links;
+ /*
+ * Linked list running through all cgroups that can
+ * potentially be reaped by the release agent. Protected by
+ * release_list_lock
+ */
+ struct list_head release_list;
+
/*
* On the default hierarchy, a css_set for a cgroup with some
* susbsys disabled will point to css's which are associated with
@@ -488,9 +495,6 @@ struct cgroup {
/* used to wait for offlining of csses */
wait_queue_head_t offline_waitq;
- /* used to schedule release agent */
- struct work_struct release_agent_work;
-
/* used to store eBPF programs */
struct cgroup_bpf bpf;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 17ee29f4071b..6693cd36fd82 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -897,6 +897,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
+void cgroup1_release_agent(struct work_struct *work);
+
#ifdef CONFIG_VE
extern void cgroup_mark_ve_root(struct ve_struct *ve);
void cgroup_unmark_ve_roots(struct ve_struct *ve);
diff --git a/include/linux/ve.h b/include/linux/ve.h
index d3c1ab840444..2ab39b607708 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -105,7 +105,15 @@ struct ve_struct {
unsigned long aio_nr;
unsigned long aio_max_nr;
#endif
+ /*
+ * cgroups, that want to notify about becoming
+ * empty, are linked to this release_list.
+ */
+ struct list_head release_list;
+ spinlock_t release_list_lock;
+
struct workqueue_struct *wq;
+ struct work_struct release_agent_work;
};
struct ve_devmnt {
@@ -127,6 +135,8 @@ extern int nr_ve;
(ve_is_super(get_exec_env()) && capable(CAP_SYS_ADMIN))
#ifdef CONFIG_VE
+void ve_add_to_release_list(struct cgroup *cgrp);
+void ve_rm_from_release_list(struct cgroup *cgrp);
extern struct ve_struct *get_ve(struct ve_struct *ve);
extern void put_ve(struct ve_struct *ve);
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 829997989c41..4de66630d456 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -135,6 +135,7 @@ extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
extern struct file_system_type cgroup_fs_type;
+struct cgroup *cgroup_get_local_root(struct cgroup *cgrp);
/* iterate across the hierarchies */
#define for_each_root(root) \
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 21a7c36fbf44..c1891317ae3a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -784,7 +784,7 @@ void cgroup1_check_for_release(struct cgroup *cgrp)
{
if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
!css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
- schedule_work(&cgrp->release_agent_work);
+ ve_add_to_release_list(cgrp);
}
/*
@@ -822,42 +822,95 @@ static inline int cgroup_path_ve_relative(struct cgroup *ve_root_cgrp,
*/
void cgroup1_release_agent(struct work_struct *work)
{
- struct cgroup *cgrp =
- container_of(work, struct cgroup, release_agent_work);
- char *pathbuf = NULL, *agentbuf = NULL;
- char *argv[3], *envp[3];
- int ret;
+ struct ve_struct *ve;
+ unsigned long flags;
+ char *agentbuf;
+
+ agentbuf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!agentbuf) {
+ pr_warn("failed to allocate agentbuf\n");
+ return;
+ }
+ ve = container_of(work, struct ve_struct, release_agent_work);
mutex_lock(&cgroup_mutex);
+ spin_lock_irqsave(&ve->release_list_lock, flags);
+ while (!list_empty(&ve->release_list)) {
+ char *argv[3], *envp[3];
+ int i, err;
+ char *pathbuf = NULL;
+ struct cgroup *cgrp, *root_cgrp;
+ const char *release_agent;
+
+ cgrp = list_entry(ve->release_list.next,
+ struct cgroup,
+ release_list);
+ list_del_init(&cgrp->release_list);
+ spin_unlock_irqrestore(&ve->release_list_lock, flags);
+
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!pathbuf)
+ goto continue_free;
+ /*
+ * At VE destruction root cgroup looses VE_ROOT flag.
+ * Because of that 'cgroup_get_local_root' will not see
+ * VE root and return host's root cgroup instead.
+ * We can detect this because we have a pointer to
+ * original ve coming from work argument.
+ * We do not want to execute VE's notifications on host,
+ * so in this case we skip.
+ */
+ rcu_read_lock();
+ root_cgrp = cgroup_get_local_root(cgrp);
- pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!pathbuf || !agentbuf || !strlen(agentbuf))
- goto out;
+ if (rcu_access_pointer(root_cgrp->ve_owner) != ve) {
+ rcu_read_unlock();
+ goto continue_free;
+ }
- spin_lock_irq(&css_set_lock);
- ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
- spin_unlock_irq(&css_set_lock);
- if (ret < 0 || ret >= PATH_MAX)
- goto out;
+ if (cgroup_path_ve_relative(root_cgrp, cgrp, pathbuf,
+ PAGE_SIZE) < 0) {
+ rcu_read_unlock();
+ goto continue_free;
+ }
- argv[0] = agentbuf;
- argv[1] = pathbuf;
- argv[2] = NULL;
+ release_agent = ve_get_release_agent_path(root_cgrp);
- /* minimal command environment */
- envp[0] = "HOME=/";
- envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[2] = NULL;
+ *agentbuf = 0;
+ if (release_agent)
+ strncpy(agentbuf, release_agent, PATH_MAX);
+ rcu_read_unlock();
+ if (!*agentbuf)
+ goto continue_free;
+
+ i = 0;
+ argv[i++] = agentbuf;
+ argv[i++] = pathbuf;
+ argv[i] = NULL;
+
+ i = 0;
+ /* minimal command environment */
+ envp[i++] = "HOME=/";
+ envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[i] = NULL;
+
+ mutex_unlock(&cgroup_mutex);
+ err = call_usermodehelper_ve(ve, argv[0], argv,
+ envp, UMH_WAIT_EXEC);
+
+ if (err < 0 && ve == &ve0)
+ pr_warn_ratelimited("cgroup1_release_agent "
+ "%s %s failed: %d\n",
+ agentbuf, pathbuf, err);
+ mutex_lock(&cgroup_mutex);
+continue_free:
+ kfree(pathbuf);
+ spin_lock_irqsave(&ve->release_list_lock, flags);
+ }
+ spin_unlock_irqrestore(&ve->release_list_lock, flags);
mutex_unlock(&cgroup_mutex);
- call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- goto out_free;
-out:
- mutex_unlock(&cgroup_mutex);
-out_free:
kfree(agentbuf);
- kfree(pathbuf);
}
/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index beb26dd7cd88..abba370eded0 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1972,6 +1972,15 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve)
spin_unlock_irq(&css_set_lock);
/* ve_owner == NULL will be visible */
synchronize_rcu();
+
+ /*
+ * Anyone already waiting in this wq to execute
+ * cgroup_release_agent doesn't know that ve_owner is NULL,
+ * but we can wait for all of them at flush_workqueue.
+ * After it is complete no other cgroup can seep through
+ * to this ve's workqueue, so it's safe to shutdown ve.
+ */
+ flush_workqueue(ve->wq);
}
struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp)
@@ -2026,7 +2035,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
init_waitqueue_head(&cgrp->offline_waitq);
- INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
@@ -5001,7 +5009,6 @@ static void css_free_rwork_fn(struct work_struct *work)
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
cgroup1_pidlist_destroy_all(cgrp);
- cancel_work_sync(&cgrp->release_agent_work);
if (cgroup_parent(cgrp)) {
/*
@@ -5577,6 +5584,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
* the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
+ ve_rm_from_release_list(cgrp);
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 25455264b225..934a5ff1c9bb 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -62,6 +62,11 @@ struct ve_struct ve0 = {
.meminfo_val = VE_MEMINFO_SYSTEM,
.vdso_64 = (struct vdso_image*)&vdso_image_64,
.vdso_32 = (struct vdso_image*)&vdso_image_32,
+ .release_list_lock = __SPIN_LOCK_UNLOCKED(
+ ve0.release_list_lock),
+ .release_list = LIST_HEAD_INIT(ve0.release_list),
+ .release_agent_work = __WORK_INITIALIZER(ve0.release_agent_work,
+ cgroup1_release_agent),
};
EXPORT_SYMBOL(ve0);
@@ -403,6 +408,44 @@ static void ve_workqueue_stop(struct ve_struct *ve)
destroy_workqueue(ve->wq);
}
+void ve_add_to_release_list(struct cgroup *cgrp)
+{
+ struct ve_struct *ve;
+ unsigned long flags;
+ int need_schedule_work = 0;
+
+ rcu_read_lock();
+ ve = cgroup_get_ve_owner(cgrp);
+
+ spin_lock_irqsave(&ve->release_list_lock, flags);
+ if (!cgroup_is_dead(cgrp) &&
+ list_empty(&cgrp->release_list)) {
+ list_add(&cgrp->release_list, &ve->release_list);
+ need_schedule_work = 1;
+ }
+ spin_unlock_irqrestore(&ve->release_list_lock, flags);
+
+ if (need_schedule_work)
+ queue_work(ve->wq, &ve->release_agent_work);
+
+ rcu_read_unlock();
+}
+
+void ve_rm_from_release_list(struct cgroup *cgrp)
+{
+ struct ve_struct *ve;
+ unsigned long flags;
+
+ rcu_read_lock();
+ ve = cgroup_get_ve_owner(cgrp);
+
+ spin_lock_irqsave(&ve->release_list_lock, flags);
+ if (!list_empty(&cgrp->release_list))
+ list_del_init(&cgrp->release_list);
+ spin_unlock_irqrestore(&ve->release_list_lock, flags);
+ rcu_read_unlock();
+}
+
/* under ve->op_sem write-lock */
static int ve_start_container(struct ve_struct *ve)
{
@@ -653,6 +696,10 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
goto err_vdso;
ve->features = VE_FEATURES_DEF;
+
+ INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
+ spin_lock_init(&ve->release_list_lock);
+
ve->_randomize_va_space = ve0._randomize_va_space;
ve->odirect_enable = 2;
@@ -673,6 +720,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
strcpy(ve->core_pattern, "core");
#endif
INIT_LIST_HEAD(&ve->devmnt_list);
+ INIT_LIST_HEAD(&ve->release_list);
mutex_init(&ve->devmnt_mutex);
#ifdef CONFIG_AIO
More information about the Devel
mailing list