[Devel] [PATCH vz7 v2] ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list

Kirill Tkhai ktkhai at virtuozzo.com
Wed Sep 5 16:40:50 MSK 2018


On 05.09.2018 16:30, Konstantin Khorenko wrote:
> A Container's cpu cgroup is linked into "ve_root_list" on CT start.
> But if someone holds the CT's cpu cgroup while the CT is being stopped,
> the next CT start tries to create the same cpu cgroup (which fails: it
> already exists) and links this cpu cgroup into "ve_root_list" a second
> time, thus corrupting the list.
> 
> As a consequence, calc_load_ve() goes into an endless loop.
> 
> Let's check whether the task_group is already linked to the list and
> skip the redundant linking.
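
For the record, the link path then reduces to the following pattern (a
minimal sketch: link_tg() is a hypothetical stand-in for
link_ve_root_cpu_cgroup(), and it assumes tg->ve_root_list was initialized
with INIT_LIST_HEAD() when the task_group was created):

	#include <linux/list.h>
	#include <linux/spinlock.h>

	static LIST_HEAD(ve_root_list);
	static DEFINE_SPINLOCK(load_ve_lock);

	/* Idempotent link: a second CT start that reuses a still-alive
	 * cpu cgroup finds the node already linked and does nothing,
	 * instead of corrupting the list with a double list_add(). */
	static void link_tg(struct task_group *tg)
	{
		unsigned long flags;

		spin_lock_irqsave(&load_ve_lock, flags);
		if (list_empty(&tg->ve_root_list))
			list_add(&tg->ve_root_list, &ve_root_list);
		spin_unlock_irqrestore(&load_ve_lock, flags);
	}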
> 
> Locking scheme change:
>  - drop RCU protection of the "ve_root_list" list: the walker already
>    holds a spinlock anyway
>  - use the "load_ve_lock" spinlock for list add/del/iterate alike;
>    "task_group_lock" is unrelated here (see the condensed
>    calc_load_ve() sketch below)
> 
> How to reproduce:
> 
>  # vzctl start 200
>  # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
>  # vzctl stop 200
>  // At this moment the VE cgroup has been destroyed, but the cpu cgroup
>  // is still alive and linked into the "ve_root_list" list
> 
>  # vzctl start 200
>  // the same tg (same cpu cgroup) is added to the "ve_root_list" list
>  // again; the double list_add() breaks the circular linkage =>
>  // list corruption => endless loop in the next calc_load_ve() call
> 
> https://jira.sw.ru/browse/PSBM-88251
> 
> Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>

Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
 
> v2 changes:
>  - change locking scheme: drop rcu, use "load_ve_lock" everywhere
>  - drop tg->linked field, check if linked using list_empty()
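
For list_empty() to work as the "already linked" test, the node must point
to itself whenever it is off the list; that is why the unlink side uses
list_del_init(). A standalone illustration (a snippet, not from the patch):

	struct list_head node;

	INIT_LIST_HEAD(&node);		/* node points to itself */
	/* list_empty(&node) == true: not linked */

	list_add(&node, &ve_root_list);
	/* list_empty(&node) == false: linked */

	list_del_init(&node);		/* unlink and re-init the node */
	/* list_empty(&node) == true again: safe to re-link later */

	/* a plain list_del() would poison node.next/node.prev instead,
	 * making any later list_empty() check on the node unusable */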
> ---
>  include/linux/sched.h |  5 ++++-
>  kernel/sched/core.c   | 32 ++++++++++++++++++--------------
>  2 files changed, 22 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 623572043ece..00d7dbca8ed6 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -3346,6 +3346,9 @@ void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
>  #ifdef CONFIG_VE
>  struct cgroup;
>  extern void link_ve_root_cpu_cgroup(struct cgroup *);
> -#endif
> +extern void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp);
> +#else /* CONFIG_VE */
> +static inline void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp) { }
> +#endif /* CONFIG_VE */
>  
>  #endif
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e572d2b14a7f..a49ea4e9b718 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2882,10 +2882,10 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
>  
>  #ifdef CONFIG_VE
>  static LIST_HEAD(ve_root_list);
> +static DEFINE_SPINLOCK(load_ve_lock);
>  
>  void calc_load_ve(void)
>  {
> -	static DEFINE_SPINLOCK(load_ve_lock);
>  	unsigned long nr_unint, nr_active;
>  	struct task_group *tg;
>  	int i;
> @@ -2895,8 +2895,7 @@ void calc_load_ve(void)
>  	 * against very rare parallel execution on two or more cpus.
>  	 */
>  	spin_lock(&load_ve_lock);
> -	rcu_read_lock();
> -	list_for_each_entry_rcu(tg, &ve_root_list, ve_root_list) {
> +	list_for_each_entry(tg, &ve_root_list, ve_root_list) {
>  		nr_active = 0;
>  		for_each_possible_cpu(i) {
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -2916,7 +2915,6 @@ void calc_load_ve(void)
>  		tg->avenrun[1] = calc_load(tg->avenrun[1], EXP_5, nr_active);
>  		tg->avenrun[2] = calc_load(tg->avenrun[2], EXP_15, nr_active);
>  	}
> -	rcu_read_unlock();
>  
>  	nr_unint = nr_uninterruptible() * FIXED_1;
>  
> @@ -9016,12 +9014,23 @@ void link_ve_root_cpu_cgroup(struct cgroup *cgrp)
>  	struct task_group *tg = cgroup_tg(cgrp);
>  	unsigned long flags;
>  
> -	spin_lock_irqsave(&task_group_lock, flags);
> +	spin_lock_irqsave(&load_ve_lock, flags);
>  	BUG_ON(!(cgrp->subsys[cpu_cgroup_subsys_id]->flags & CSS_ONLINE));
> -	list_add_rcu(&tg->ve_root_list, &ve_root_list);
> -	spin_unlock_irqrestore(&task_group_lock, flags);
> +	if (list_empty(&tg->ve_root_list))
> +		list_add(&tg->ve_root_list, &ve_root_list);
> +	spin_unlock_irqrestore(&load_ve_lock, flags);
>  }
> -#endif
> +
> +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp)
> +{
> +	struct task_group *tg = cgroup_tg(cgrp);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&load_ve_lock, flags);
> +	list_del_init(&tg->ve_root_list);
> +	spin_unlock_irqrestore(&load_ve_lock, flags);
> +}
> +#endif /* CONFIG_VE */
>  
>  static void free_sched_group(struct task_group *tg)
>  {
> @@ -9614,13 +9623,8 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
>  static void cpu_cgroup_css_offline(struct cgroup *cgrp)
>  {
>  	struct task_group *tg = cgroup_tg(cgrp);
> -#ifdef CONFIG_VE
> -	unsigned long flags;
>  
> -	spin_lock_irqsave(&task_group_lock, flags);
> -	list_del_rcu(&tg->ve_root_list);
> -	spin_unlock_irqrestore(&task_group_lock, flags);
> -#endif
> +	unlink_ve_root_cpu_cgroup(cgrp);
>  	sched_offline_group(tg);
>  }
>  
> 

