[Devel] [PATCH RHEL8 COMMIT] ve/sched: Link VE root cpu cgroups in separate list

Konstantin Khorenko khorenko at virtuozzo.com
Wed Oct 28 19:19:29 MSK 2020


The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.14
------>
commit 8faa76f7083792e2eab864c1c50ea531e8d5ed68
Author: Kirill Tkhai <ktkhai at virtuozzo.com>
Date:   Tue Feb 27 15:28:29 2018 +0300

    ve/sched: Link VE root cpu cgroups in separate list
    
    The idea is to link the small number of VE root cpu cgroups
    into a separate list. This lets the next patches avoid
    unnecessary loadavg calculations for VE child cpu cgroups,
    and it should improve the performance of calc_load_ve(), as
    sketched below.
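
    A minimal sketch of the intended consumer (not part of this patch; the
    per-group helper calc_load_ve_tg() is a hypothetical stand-in for the
    loadavg accumulation the next patches move into calc_load_ve(), and
    "load_ve_lock" is the spinlock added below):

        static void calc_load_ve(void)
        {
                struct task_group *tg;
                unsigned long flags;

                spin_lock_irqsave(&load_ve_lock, flags);
                /* Only VE root cpu cgroups are linked here, so the walk is short. */
                list_for_each_entry(tg, &ve_root_list, ve_root_list)
                        calc_load_ve_tg(tg);
                spin_unlock_irqrestore(&load_ve_lock, flags);
        }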
    
    https://jira.sw.ru/browse/PSBM-81572
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    
    (cherry picked from vz7 commit c9af0076fff0ac796dcbec8ef17424ae08a9f54d)
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    +++
    ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list
    
    A Container's cpu cgroup is linked into "ve_root_list" on CT start.
    But if someone holds the CT's cpu cgroup while the CT is being stopped,
    the next CT start tries to create the same cpu cgroup (which fails, as
    it already exists) and links this cpu cgroup into "ve_root_list" again,
    thus corrupting the list.

    As a consequence, calc_load_ve() goes into an endless loop.

    Let's check whether the task_group has already been linked into the
    list and skip the redundant linking.
    
    Locking scheme change (condensed sketch after this list):
    - drop RCU for the "ve_root_list" list; we hold spinlocks anyway
    - use the "load_ve_lock" spinlock for list add/del/iterate;
      "task_group_lock" is unrelated here
    
    How to reproduce:
    
     # vzctl start 200
     # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
     # vzctl stop 200
    // At this moment the VE cgroup has been destroyed, but the cpu cgroup
    // is still alive and linked into the "ve_root_list" list
    
     # vzctl start 200
    // double add of the same tg (same cpu cgroup) to the "ve_root_list" list =>
    // list corruption => endless loop in the next calc_load_ve() call
    
    https://jira.sw.ru/browse/PSBM-88251
    
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    
    v2 changes:
     - change the locking scheme: drop RCU, use "load_ve_lock" everywhere
     - drop the tg->linked field; check whether the tg is linked using
       list_empty() (see the sketch below)
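
    A sketch of why list_empty() suffices as the "linked" test (the helper
    name tg_is_ve_root() is hypothetical, for illustration only):

        /* INIT_LIST_HEAD() in cpu_cgroup_css_online() makes the node point
         * to itself, so list_empty() reports "not linked".  list_add()
         * links the node in; list_del_init() re-initializes it on unlink
         * (plain list_del() would poison the pointers instead), so the
         * test stays valid across repeated link/unlink cycles. */
        static bool tg_is_ve_root(struct task_group *tg)
        {
                return !list_empty(&tg->ve_root_list);
        }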
    
    [VvS RHEL77b rebase]
    
    (cherry picked from vz7 commit cba368b94c0ad159f676539f554e9cc9d53aedaa)
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
---
 include/linux/sched.h  |  8 ++++++++
 kernel/cgroup/cgroup.c |  1 +
 kernel/sched/core.c    | 31 +++++++++++++++++++++++++++++++
 kernel/sched/sched.h   |  4 ++++
 4 files changed, 44 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4326aa24e9dc..cabed6a47a70 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2015,4 +2015,12 @@ static inline void rseq_syscall(struct pt_regs *regs)
 
 #endif
 
+#ifdef CONFIG_VE
+struct cgroup_subsys_state;
+extern void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css);
+extern void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css);
+#else /* CONFIG_VE */
+static inline void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css) { }
+#endif /* CONFIG_VE */
+
 #endif
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 08137d43f3ab..4ee3eb24b0d1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1894,6 +1894,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
 		cgrp = link->cgrp;
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
 	}
+	link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
 unlock:
 	rcu_read_unlock();
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a57956d64d6..a6100bf3f625 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6581,6 +6581,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 	struct task_group *tg = css_tg(css);
 	struct task_group *parent = css_tg(css->parent);
 
+#ifdef CONFIG_VE
+	INIT_LIST_HEAD(&tg->ve_root_list);
+#endif
 	if (parent)
 		sched_online_group(tg, parent);
 	return 0;
@@ -6590,6 +6593,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
 
+	unlink_ve_root_cpu_cgroup(css);
 	sched_offline_group(tg);
 }
 
@@ -6677,6 +6681,33 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
 	return (u64) scale_load_down(tg->shares);
 }
 
+#ifdef CONFIG_VE
+LIST_HEAD(ve_root_list);
+DEFINE_SPINLOCK(load_ve_lock);
+
+void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+	unsigned long flags;
+
+	spin_lock_irqsave(&load_ve_lock, flags);
+	BUG_ON(!(css->flags & CSS_ONLINE));
+	if (list_empty(&tg->ve_root_list))
+		list_add(&tg->ve_root_list, &ve_root_list);
+	spin_unlock_irqrestore(&load_ve_lock, flags);
+}
+
+void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+	unsigned long flags;
+
+	spin_lock_irqsave(&load_ve_lock, flags);
+	list_del_init(&tg->ve_root_list);
+	spin_unlock_irqrestore(&load_ve_lock, flags);
+}
+#endif /* CONFIG_VE */
+
 #ifdef CONFIG_CFS_BANDWIDTH
 static DEFINE_MUTEX(cfs_constraints_mutex);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2f0c26b2c50..93bf1d78c27d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -404,6 +404,10 @@ struct task_group {
 	struct autogroup	*autogroup;
 #endif
 
+#ifdef CONFIG_VE
+	struct list_head ve_root_list;
+#endif
+
 	/* Monotonic time in nsecs: */
 	u64			start_time;
 

