[Devel] [PATCH vz7 v2] ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list
Kirill Tkhai
ktkhai at virtuozzo.com
Wed Sep 5 16:40:50 MSK 2018
On 05.09.2018 16:30, Konstantin Khorenko wrote:
> A Container's cpu cgroup is linked into the "ve_root_list" on CT start.
> But if someone still holds the CT's cpu cgroup while the CT is being
> stopped, the next CT start tries to create the same cpu cgroup (and
> fails: it already exists) and links this cpu cgroup into "ve_root_list"
> again, thus corrupting the list.
>
> As a consequence, the next calc_load_ve() call goes into an endless loop.
>
> Let's check whether the task_group is already linked into the list and
> skip the redundant linking.
>
> Locking scheme change:
> - drop rcu protection for the "ve_root_list" list: the only reader,
>   calc_load_ve(), already holds the spinlock, so rcu buys nothing once
>   the writers take the same lock
> - use the "load_ve_lock" spinlock for all list add/del/iterate
>   operations; "task_group_lock" is unrelated here
>
> How to reproduce:
>
> # vzctl start 200
> # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
> # vzctl stop 200
> // At this moment the VE cgroup has been destroyed, but the cpu cgroup
> // is still alive and linked into the "ve_root_list" list
>
> # vzctl start 200
> // a second add of the same tg (same cpu cgroup) to the "ve_root_list"
> // list => list corruption => endless loop in the next calc_load_ve() call
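
For reference, a minimal userspace sketch of the corruption (stripped-down
stand-ins for the kernel's struct list_head and list_add(), not the kernel
code itself): the second list_add() of an already-linked node leaves
node->next pointing back at the node, so a list walk can never reach the
head again:

#include <stdio.h>

/* stripped-down struct list_head and list_add() */
struct list_head { struct list_head *next, *prev; };

static void list_add(struct list_head *new, struct list_head *head)
{
	struct list_head *next = head->next;

	next->prev = new;
	new->next = next;
	new->prev = head;
	head->next = new;
}

int main(void)
{
	struct list_head head = { &head, &head };
	struct list_head tg;	/* stands in for tg->ve_root_list */
	struct list_head *pos;
	int steps = 0;

	list_add(&tg, &head);	/* first CT start: fine */
	list_add(&tg, &head);	/* second CT start: tg.next == &tg now */

	for (pos = head.next; pos != &head; pos = pos->next) {
		if (++steps > 5) {	/* bound the walk for the demo */
			printf("still walking after %d steps\n", steps);
			return 1;
		}
	}
	return 0;
}
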
>
> https://jira.sw.ru/browse/PSBM-88251
>
> Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> v2 changes:
> - change locking scheme: drop rcu, use "load_ve_lock" everywhere
> - drop tg->linked field, check if linked using list_empty()
> ---
> include/linux/sched.h | 5 ++++-
> kernel/sched/core.c | 32 ++++++++++++++++++--------------
> 2 files changed, 22 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 623572043ece..00d7dbca8ed6 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -3346,6 +3346,9 @@ void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
> #ifdef CONFIG_VE
> struct cgroup;
> extern void link_ve_root_cpu_cgroup(struct cgroup *);
> -#endif
> +extern void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp);
> +#else /* CONFIG_VE */
> +static inline void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp) { }
> +#endif /* CONFIG_VE */
>
> #endif
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e572d2b14a7f..a49ea4e9b718 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2882,10 +2882,10 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
>
> #ifdef CONFIG_VE
> static LIST_HEAD(ve_root_list);
> +static DEFINE_SPINLOCK(load_ve_lock);
>
> void calc_load_ve(void)
> {
> - static DEFINE_SPINLOCK(load_ve_lock);
> unsigned long nr_unint, nr_active;
> struct task_group *tg;
> int i;
> @@ -2895,8 +2895,7 @@ void calc_load_ve(void)
> * against very rare parallel execution on two or more cpus.
> */
> spin_lock(&load_ve_lock);
> - rcu_read_lock();
> - list_for_each_entry_rcu(tg, &ve_root_list, ve_root_list) {
> + list_for_each_entry(tg, &ve_root_list, ve_root_list) {
> nr_active = 0;
> for_each_possible_cpu(i) {
> #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -2916,7 +2915,6 @@ void calc_load_ve(void)
> tg->avenrun[1] = calc_load(tg->avenrun[1], EXP_5, nr_active);
> tg->avenrun[2] = calc_load(tg->avenrun[2], EXP_15, nr_active);
> }
> - rcu_read_unlock();
>
> nr_unint = nr_uninterruptible() * FIXED_1;
>
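
To recap what this loop maintains: calc_load() is the kernel's fixed-point
exponential moving average, so each VE's avenrun[] decays exactly like the
global load average. A standalone rendition, with the constants as in
mainline's loadavg code and nr_active pre-scaled by FIXED_1 before the
call, as the kernel does (both assumed here, for illustration only):

#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed-point */
#define EXP_1	1884			/* 1/exp(5sec/1min) in fixed-point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun = 0;	/* tg->avenrun[0] stand-in */
	int tick;

	/* 4 runnable tasks, sampled every 5 seconds for one minute */
	for (tick = 0; tick < 12; tick++)
		avenrun = calc_load(avenrun, EXP_1, 4 * FIXED_1);

	/* prints ~2.53: one minute in, the average has covered ~63% */
	printf("1-min load ~ %lu.%02lu\n", avenrun >> FSHIFT,
	       ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}
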
> @@ -9016,12 +9014,23 @@ void link_ve_root_cpu_cgroup(struct cgroup *cgrp)
> struct task_group *tg = cgroup_tg(cgrp);
> unsigned long flags;
>
> - spin_lock_irqsave(&task_group_lock, flags);
> + spin_lock_irqsave(&load_ve_lock, flags);
> BUG_ON(!(cgrp->subsys[cpu_cgroup_subsys_id]->flags & CSS_ONLINE));
> - list_add_rcu(&tg->ve_root_list, &ve_root_list);
> - spin_unlock_irqrestore(&task_group_lock, flags);
> + if (list_empty(&tg->ve_root_list))
> + list_add(&tg->ve_root_list, &ve_root_list);
> + spin_unlock_irqrestore(&load_ve_lock, flags);
> }
> -#endif
> +
> +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp)
> +{
> + struct task_group *tg = cgroup_tg(cgrp);
> + unsigned long flags;
> +
> + spin_lock_irqsave(&load_ve_lock, flags);
> + list_del_init(&tg->ve_root_list);
> + spin_unlock_irqrestore(&load_ve_lock, flags);
> +}
> +#endif /* CONFIG_VE */
>
> static void free_sched_group(struct task_group *tg)
> {
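
One thing worth spelling out: the list_empty() test on the entry itself
works as an "already linked?" check only because the node starts out
self-pointing and is unlinked with list_del_init(), which makes it
self-pointing again (a plain list_del() would poison the pointers). A
userspace sketch of the idiom, mirroring the link/unlink pair above and
assuming tg->ve_root_list gets INIT_LIST_HEAD()-ed when the task_group
is created:

#include <assert.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static int list_empty(const struct list_head *head)
{
	return head->next == head;	/* self-pointing == not linked */
}

static void list_add(struct list_head *new, struct list_head *head)
{
	struct list_head *next = head->next;

	next->prev = new;
	new->next = next;
	new->prev = head;
	head->next = new;
}

static void list_del_init(struct list_head *entry)
{
	entry->next->prev = entry->prev;
	entry->prev->next = entry->next;
	entry->next = entry;	/* self-pointing again... */
	entry->prev = entry;	/* ...so list_empty() is true */
}

int main(void)
{
	struct list_head root = LIST_HEAD_INIT(root);
	struct list_head tg = LIST_HEAD_INIT(tg);

	/* link_ve_root_cpu_cgroup(): the second call is a no-op now */
	if (list_empty(&tg))
		list_add(&tg, &root);
	if (list_empty(&tg))	/* false: already linked, skipped */
		list_add(&tg, &root);

	/* unlink_ve_root_cpu_cgroup(): node may be re-linked later */
	list_del_init(&tg);
	assert(list_empty(&tg));
	return 0;
}
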
> @@ -9614,13 +9623,8 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
> static void cpu_cgroup_css_offline(struct cgroup *cgrp)
> {
> struct task_group *tg = cgroup_tg(cgrp);
> -#ifdef CONFIG_VE
> - unsigned long flags;
>
> - spin_lock_irqsave(&task_group_lock, flags);
> - list_del_rcu(&tg->ve_root_list);
> - spin_unlock_irqrestore(&task_group_lock, flags);
> -#endif
> + unlink_ve_root_cpu_cgroup(cgrp);
> sched_offline_group(tg);
> }
>
>