[Devel] [PATCH 4/7] ve/cgroup: Moved cgroup release notifications to per-ve workqueues.
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Thu Apr 2 13:00:16 MSK 2020
On 4/1/20 6:41 PM, Valeriy Vdovin wrote:
> Notifications about cgroups becoming empty should be executed from
> inside the VE that owns the cgroup. This patch splits the notification
> logic between ve code and cgroup code. The VE is now responsible for
> storing the list of cgroups that want to notify about themselves. The
> VE also manages the execution of the release_agent work under its
> domain, so that the usermode process is spawned in this VE's
> namespaces.
> The VE is also responsible for declining any scheduling of
> release_agent works while it is stopping.
> The cgroup code is still responsible for setting/getting the
> release_agent path parameter and passing it between mounts, but it now
> also stores the owning ve as a field in the cgroup struct.
>
> https://jira.sw.ru/browse/PSBM-83887
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
> include/linux/cgroup.h | 26 +++++++++
> include/linux/ve.h | 17 ++++++
> kernel/cgroup.c | 156 +++++++++++++++++++++++++++++++++----------------
> kernel/ve/ve.c | 93 ++++++++++++++++++++++++++++-
> 4 files changed, 238 insertions(+), 54 deletions(-)
>
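Just to spell out the flow after this patch as I read it (a rough sketch of the call chain, not the literal code):

    check_for_release(cgrp)
      -> root_cgrp = cgroup_get_local_root(cgrp)
      -> if (root_cgrp->ve_owner)
             ve_add_to_release_list(ve, cgrp)     /* under ve->op_sem, only while ve->is_running */
               -> queue_work(ve->wq, &ve->release_agent_work)
                    -> cgroup_release_agent()     /* executes on the per-ve workqueue */
                         -> call_usermodehelper_fns_ve(ve, argv[0], envp, UMH_WAIT_EXEC, ...)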
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index b0b8b9e..86d3e5d 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -315,6 +315,9 @@ struct cgroup {
> /* directory xattrs */
> struct simple_xattrs xattrs;
> u64 subgroups_limit;
> +
> + /* ve_owner, responsible for running release agent. */
> + struct ve_struct *ve_owner;
> };
>
> #define MAX_CGROUP_ROOT_NAMELEN 64
> @@ -466,6 +469,27 @@ struct css_set {
> };
>
> /*
> + * refcounted get/put for css_set objects
> + */
> +
> +void __put_css_set(struct css_set *cg, int taskexit);
> +
> +static inline void get_css_set(struct css_set *cg)
> +{
> + atomic_inc(&cg->refcount);
> +}
> +
> +static inline void put_css_set(struct css_set *cg)
> +{
> + __put_css_set(cg, 0);
> +}
> +
> +static inline void put_css_set_taskexit(struct css_set *cg)
> +{
> + __put_css_set(cg, 1);
> +}
> +
> +/*
> * cgroup_map_cb is an abstract callback API for reporting map-valued
> * control files
> */
> @@ -636,9 +660,11 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
> int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen);
>
> int cgroup_task_count(const struct cgroup *cgrp);
> +void cgroup_release_agent(struct work_struct *work);
>
> #ifdef CONFIG_VE
> void cgroup_mark_ve_root(struct ve_struct *ve);
> +void cgroup_unbind_roots_from_ve(struct ve_struct *ve);
> #endif
>
> /*
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 92ec8f9..fcdb25a 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -126,7 +126,22 @@ struct ve_struct {
> #endif
> struct kmapset_key sysfs_perms_key;
>
> + /*
> + * cgroups, that want to notify about becoming
> + * empty, are linked to this release_list.
> + */
> + struct list_head release_list;
> + struct raw_spinlock release_list_lock;
> +
> struct workqueue_struct *wq;
> + struct work_struct release_agent_work;
> +
> + /*
> + * All tasks, that belong to this ve, live
> + * in cgroups, that are children to cgroups
> + * that form this css_set.
> + */
> + struct css_set __rcu *root_css_set;
> };
>
> struct ve_devmnt {
> @@ -190,6 +205,8 @@ call_usermodehelper_ve(struct ve_struct *ve, char *path, char **argv,
> }
> void do_update_load_avg_ve(void);
>
> +void ve_add_to_release_list(struct ve_struct *ve, struct cgroup *cgrp);
> +void ve_rm_from_release_list(struct ve_struct *ve, struct cgroup *cgrp);
> extern struct ve_struct *get_ve(struct ve_struct *ve);
> extern void put_ve(struct ve_struct *ve);
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 3df28a2..680ec8e 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -271,10 +271,6 @@ static bool cgroup_lock_live_group(struct cgroup *cgrp)
>
> /* the list of cgroups eligible for automatic release. Protected by
> * release_list_lock */
> -static LIST_HEAD(release_list);
> -static DEFINE_RAW_SPINLOCK(release_list_lock);
> -static void cgroup_release_agent(struct work_struct *work);
> -static DECLARE_WORK(release_agent_work, cgroup_release_agent);
> static void check_for_release(struct cgroup *cgrp);
>
> /* Link structure for associating css_set objects with cgroups */
> @@ -335,7 +331,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
> * compiled into their kernel but not actually in use */
> static int use_task_css_set_links __read_mostly;
>
> -static void __put_css_set(struct css_set *cg, int taskexit)
> +void __put_css_set(struct css_set *cg, int taskexit)
> {
> struct cg_cgroup_link *link;
> struct cg_cgroup_link *saved_link;
> @@ -384,24 +380,6 @@ static void __put_css_set(struct css_set *cg, int taskexit)
> }
>
> /*
> - * refcounted get/put for css_set objects
> - */
> -static inline void get_css_set(struct css_set *cg)
> -{
> - atomic_inc(&cg->refcount);
> -}
> -
> -static inline void put_css_set(struct css_set *cg)
> -{
> - __put_css_set(cg, 0);
> -}
> -
> -static inline void put_css_set_taskexit(struct css_set *cg)
> -{
> - __put_css_set(cg, 1);
> -}
> -
> -/*
> * compare_css_sets - helper function for find_existing_css_set().
> * @cg: candidate css_set being tested
> * @old_cg: existing css_set for a task
> @@ -654,6 +632,35 @@ static struct css_set *find_css_set(
> return res;
> }
>
> +static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
> + struct cgroupfs_root *root)
> +{
> + struct cgroup *res = NULL;
> +
> + BUG_ON(!mutex_is_locked(&cgroup_mutex));
> + read_lock(&css_set_lock);
> + /*
> + * No need to lock the task - since we hold cgroup_mutex the
> + * task can't change groups, so the only thing that can happen
> + * is that it exits and its css is set back to init_css_set.
> + */
> + if (css_set == &init_css_set) {
> + res = &root->top_cgroup;
> + } else {
> + struct cg_cgroup_link *link;
> + list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
> + struct cgroup *c = link->cgrp;
> + if (c->root == root) {
> + res = c;
> + break;
> + }
> + }
> + }
> + read_unlock(&css_set_lock);
> + BUG_ON(!res);
> + return res;
> +}
> +
> /*
> * Return the cgroup for "task" from the given hierarchy. Must be
> * called with cgroup_mutex held.
> @@ -796,6 +803,31 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
> return name;
> }
>
> +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
> +{
> + /*
> + * Find nearest root cgroup, which might be host cgroup root
> + * or ve cgroup root.
> + *
> + * <host_root_cgroup> -> local_root
> + * \ ^
> + * <cgroup> |
> + * \ |
> + * <cgroup> ---> from here
> + * \
> + * <ve_root_cgroup> -> local_root
> + * \ ^
> + * <cgroup> |
> + * \ |
> + * <cgroup> ---> from here
> + */
> + lockdep_assert_held(&cgroup_mutex);
> + while (cgrp->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags))
> + cgrp = cgrp->parent;
> +
> + return cgrp;
> +}
> +
> static void cgroup_free_fn(struct work_struct *work)
> {
> struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
> @@ -1638,6 +1670,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
> const struct cred *cred;
> int i;
> struct css_set *cg;
> + root_cgrp->ve_owner = get_exec_env();
>
> BUG_ON(sb->s_root != NULL);
>
> @@ -4320,6 +4353,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
> mutex_lock(&cgroup_mutex);
> for_each_active_root(root) {
> cgrp = task_cgroup_from_root(ve->init_task, root);
> + cgrp->ve_owner = ve;
> set_bit(CGRP_VE_ROOT, &cgrp->flags);
>
> if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
> @@ -4328,6 +4362,20 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
> mutex_unlock(&cgroup_mutex);
> }
>
> +void cgroup_unbind_roots_from_ve(struct ve_struct *ve)
> +{
> + struct cgroup *cgrp;
> + struct cgroupfs_root *root;
> +
> + mutex_lock(&cgroup_mutex);
> + for_each_active_root(root) {
> + cgrp = css_cgroup_from_root(ve->root_css_set, root);
> + BUG_ON(!cgrp->ve_owner);
> + cgrp->ve_owner = NULL;
> + }
> + mutex_unlock(&cgroup_mutex);
> +}
> +
> struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
> {
> struct cgroup *ve_root = NULL;
> @@ -4564,6 +4612,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
> struct cgroup_event *event, *tmp;
> struct cgroup_subsys *ss;
> struct cgroup *child;
> + struct cgroup *root_cgrp;
> bool empty;
>
> lockdep_assert_held(&d->d_inode->i_mutex);
> @@ -4620,11 +4669,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
>
> set_bit(CGRP_REMOVED, &cgrp->flags);
>
> - raw_spin_lock(&release_list_lock);
> - if (!list_empty(&cgrp->release_list))
> - list_del_init(&cgrp->release_list);
> - raw_spin_unlock(&release_list_lock);
> -
> + root_cgrp = cgroup_get_local_root(cgrp);
> + if (root_cgrp->ve_owner)
> + ve_rm_from_release_list(root_cgrp->ve_owner, cgrp);
> /*
> * Remove @cgrp directory. The removal puts the base ref but we
> * aren't quite done with @cgrp yet, so hold onto it.
> @@ -5399,6 +5446,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
>
> static void check_for_release(struct cgroup *cgrp)
> {
> + struct cgroup *root_cgrp;
> /* All of these checks rely on RCU to keep the cgroup
> * structure alive */
> if (cgroup_is_releasable(cgrp) &&
> @@ -5408,17 +5456,10 @@ static void check_for_release(struct cgroup *cgrp)
> * already queued for a userspace notification, queue
> * it now
> */
> - int need_schedule_work = 0;
> -
> - raw_spin_lock(&release_list_lock);
> - if (!cgroup_is_removed(cgrp) &&
> - list_empty(&cgrp->release_list)) {
> - list_add(&cgrp->release_list, &release_list);
> - need_schedule_work = 1;
> - }
> - raw_spin_unlock(&release_list_lock);
> - if (need_schedule_work)
> - schedule_work(&release_agent_work);
> + root_cgrp = cgroup_get_local_root(cgrp);
> + BUG_ON(!root_cgrp);
> + if (root_cgrp->ve_owner)
> + ve_add_to_release_list(root_cgrp->ve_owner, cgrp);
> }
> }
>
> @@ -5445,24 +5486,35 @@ static void check_for_release(struct cgroup *cgrp)
> * this routine has no use for the exit status of the release agent
> * task, so no sense holding our caller up for that.
> */
> -static void cgroup_release_agent(struct work_struct *work)
> +void cgroup_release_agent(struct work_struct *work)
> {
> - BUG_ON(work != &release_agent_work);
> + struct ve_struct *ve;
> + ve = container_of(work, struct ve_struct, release_agent_work);
> + down_read(&ve->op_sem);
> + if (!ve->is_running) {
> + up_read(&ve->op_sem);
> + return;
> + }
> + up_read(&ve->op_sem);
Can you please explain why we have a lock for a single is_running check?
It can switch to zero immediately after up_read.
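To illustrate the window I mean (a rough interleaving, not exact code):

    cgroup_release_agent()                 ve_stop_ns()
      down_read(&ve->op_sem);
      if (!ve->is_running) ...             /* still 1, so we proceed */
      up_read(&ve->op_sem);
                                           down_write(&ve->op_sem);
                                           ve->is_running = 0;
                                           up_write(&ve->op_sem);
      mutex_lock(&cgroup_mutex);
      /* keeps going even though the ve is already stopping */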
> mutex_lock(&cgroup_mutex);
> - raw_spin_lock(&release_list_lock);
> - while (!list_empty(&release_list)) {
> + raw_spin_lock(&ve->release_list_lock);
> + while (!list_empty(&ve->release_list)) {
> char *argv[3], *envp[3];
> int i, err;
> + const char *release_agent;
> char *pathbuf = NULL, *agentbuf = NULL;
> - struct cgroup *cgrp = list_entry(release_list.next,
> - struct cgroup,
> - release_list);
> + struct cgroup *cgrp, *root_cgrp;
> +
> + cgrp = list_entry(ve->release_list.next,
> + struct cgroup,
> + release_list);
> +
> list_del_init(&cgrp->release_list);
> - raw_spin_unlock(&release_list_lock);
> + raw_spin_unlock(&ve->release_list_lock);
> pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
> if (!pathbuf)
> goto continue_free;
> - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
> + if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, 1) < 0)
> goto continue_free;
> agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
> if (!agentbuf)
> @@ -5483,7 +5535,9 @@ static void cgroup_release_agent(struct work_struct *work)
> * since the exec could involve hitting disk and hence
> * be a slow process */
> mutex_unlock(&cgroup_mutex);
> - err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
> +
> + err = call_usermodehelper_fns_ve(ve, argv[0], argv,
> + envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
> if (err < 0)
> pr_warn_ratelimited("cgroup release_agent "
> "%s %s failed: %d\n",
> @@ -5493,9 +5547,9 @@ static void cgroup_release_agent(struct work_struct *work)
> continue_free:
> kfree(pathbuf);
> kfree(agentbuf);
> - raw_spin_lock(&release_list_lock);
> + raw_spin_lock(&ve->release_list_lock);
> }
> - raw_spin_unlock(&release_list_lock);
> + raw_spin_unlock(&ve->release_list_lock);
> mutex_unlock(&cgroup_mutex);
> }
>
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 587f445..7eff0e7 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -87,6 +87,10 @@ struct ve_struct ve0 = {
> .netif_max_nr = INT_MAX,
> .arp_neigh_nr = ATOMIC_INIT(0),
> .nd_neigh_nr = ATOMIC_INIT(0),
> + .release_list_lock = __RAW_SPIN_LOCK_UNLOCKED(ve0.release_list_lock),
> + .release_list = LIST_HEAD_INIT(ve0.release_list),
> + .release_agent_work = __WORK_INITIALIZER(ve0.release_agent_work,
> + cgroup_release_agent),
> };
> EXPORT_SYMBOL(ve0);
>
> @@ -453,6 +457,9 @@ static void ve_grab_context(struct ve_struct *ve)
>
> get_task_struct(tsk);
> ve->init_task = tsk;
> + ve->root_css_set = tsk->cgroups;
> + get_css_set(ve->root_css_set);
> +
> ve->init_cred = (struct cred *)get_current_cred();
> rcu_assign_pointer(ve->ve_ns, get_nsproxy(tsk->nsproxy));
> ve->ve_netns = get_net(ve->ve_ns->net_ns);
> @@ -465,6 +472,9 @@ static void ve_drop_context(struct ve_struct *ve)
> put_net(ve->ve_netns);
> ve->ve_netns = NULL;
>
> + put_css_set_taskexit(ve->root_css_set);
> + ve->root_css_set = NULL;
> +
> /* Allows to dereference init_cred and init_task if ve_ns is set */
> rcu_assign_pointer(ve->ve_ns, NULL);
> synchronize_rcu();
> @@ -477,7 +487,6 @@ static void ve_drop_context(struct ve_struct *ve)
>
> put_task_struct(ve->init_task);
> ve->init_task = NULL;
> -
> }
>
> static const struct timespec zero_time = { };
> @@ -504,6 +513,53 @@ static void ve_workqueue_stop(struct ve_struct *ve)
> destroy_workqueue(ve->wq);
> }
>
> +void ve_add_to_release_list(struct ve_struct *ve, struct cgroup *cgrp)
> +{
> + int need_schedule_work = 0;
> +
> + down_write(&ve->op_sem);
> +
> + if (!ve->is_running)
> + goto out_unlock;
> +
> + /*
> + * The guarantee dependency chain is as follows:
> + * - release_list_lock is valid because ve != NULL.
> + * - is_running == 1 is valid because ve->op_sem is held.
> + * - ve->op_sem is valid because ve != NULL.
> + * - ve != NULL should be guaranteed by the caller.
> + */
> + raw_spin_lock(&ve->release_list_lock);
> + if (!cgroup_is_removed(cgrp) &&
> + list_empty(&cgrp->release_list)) {
> + list_add(&cgrp->release_list, &ve->release_list);
> + need_schedule_work = 1;
> + }
> + raw_spin_unlock(&ve->release_list_lock);
> + /*
> + * Because we are under the ve->op_sem lock, ve->is_running can
> + * be set to 0 only after this work is already queued to ve->wq,
> + * so ve_stop_ns is guaranteed to wait until this work has been
> + * flushed before proceeding.
> + */
> + if (need_schedule_work)
> + queue_work(ve->wq, &ve->release_agent_work);
> +
> +out_unlock:
> + up_write(&ve->op_sem);
> +}
> +
> +void ve_rm_from_release_list(struct ve_struct *ve, struct cgroup *cgrp)
> +{
> + /*
> + * There is no is_running check here, because the caller
> + * is more flexible in using the ve->op_sem lock from above.
> + */
> + raw_spin_lock(&ve->release_list_lock);
> + if (!list_empty(&cgrp->release_list))
> + list_del_init(&cgrp->release_list);
> + raw_spin_unlock(&ve->release_list_lock);
> +}
>
> /* under ve->op_sem write-lock */
> static int ve_start_container(struct ve_struct *ve)
> @@ -579,6 +635,8 @@ err_list:
> return err;
> }
>
> +extern void cgroup_unbind_roots_from_ve(struct ve_struct *ve);
> +
> void ve_stop_ns(struct pid_namespace *pid_ns)
> {
> struct ve_struct *ve = current->task_ve;
> @@ -590,19 +648,44 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
> if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
> return;
>
> - wait_khelpers();
> -
> down_write(&ve->op_sem);
> +
> + /*
> + * Zero-out cgroup->ve_owner for all ve roots.
> + * After this is done cgroups will not be able to
> + * pass through check_for_release into this ve's
> + * release_list.
> + */
> + cgroup_unbind_roots_from_ve(ve);
> +
> /*
> * Here the VE changes its state into "not running".
> * op_sem works as barrier for vzctl ioctls.
> * ve_mutex works as barrier for ve_can_attach().
> + * is_running = 0 also blocks further insertions to
> + * release_list.
> */
> ve->is_running = 0;
>
> + up_write(&ve->op_sem);
> +
> + /*
> + * Stopping the workqueue flushes all works queued to it and
> + * disables any further insertions into it. The ordering
> + * relative to wait_khelpers does not matter.
> + */
> ve_workqueue_stop(ve);
>
> /*
> + * Wait until all calls to call_usermodehelper_by
> + * finish, to ensure that any usermode tasks launched before
> + * ve->is_running was set to 0 have finished by this line.
> + */
> + wait_khelpers();
The call to wait_khelpers waits for all call_usermodehelper_by calls
started before ve->init_task->flags was set to PF_EXITING to finish.
As you mentioned in Slack, we actually should wait not for all
call_usermodehelper_by calls but only for the ones related to the
current ve. But that does not depend on is_running as you write in
the comment, AFAICS.
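If we only need to wait for this ve's helpers, maybe something along these lines would do (purely hypothetical sketch, these fields and hook points do not exist in our tree yet):

    /* hypothetical fields in struct ve_struct */
    atomic_t          nr_khelpers;    /* helpers spawned on behalf of this ve */
    wait_queue_head_t khelpers_waitq;

    /* call_usermodehelper_fns_ve(): atomic_inc(&ve->nr_khelpers) before
     * spawning the helper, atomic_dec() + wake_up(&ve->khelpers_waitq)
     * once it has been waited for */

    /* ve_stop_ns(): instead of the global wait_khelpers() */
    wait_event(ve->khelpers_waitq, atomic_read(&ve->nr_khelpers) == 0);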
> +
> + down_write(&ve->op_sem);
> +
> + /*
> * Neither it can be in pseudosuper state
> * anymore, setup it again if needed.
> */
> @@ -698,6 +781,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
> if (!ve->ve_name)
> goto err_name;
>
> + INIT_WORK(&ve->release_agent_work, cgroup_release_agent);
> + raw_spin_lock_init(&ve->release_list_lock);
> +
> ve->_randomize_va_space = ve0._randomize_va_space;
>
> ve->features = VE_FEATURES_DEF;
> @@ -732,6 +818,7 @@ do_init:
> INIT_LIST_HEAD(&ve->devices);
> INIT_LIST_HEAD(&ve->ve_list);
> INIT_LIST_HEAD(&ve->devmnt_list);
> + INIT_LIST_HEAD(&ve->release_list);
> mutex_init(&ve->devmnt_mutex);
>
> #ifdef CONFIG_AIO
>
--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.