[Devel] [PATCH RHEL COMMIT] ve/sched: Link VE root cpu cgroups in separate list

Konstantin Khorenko khorenko at virtuozzo.com
Fri Oct 1 19:38:42 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit 95ca5376787ae2f4768bb63d0cc9730117295548
Author: Kirill Tkhai <ktkhai at virtuozzo.com>
Date:   Fri Oct 1 19:38:42 2021 +0300

    ve/sched: Link VE root cpu cgroups in separate list
    
    The idea is to link small number of VE root cpu cgroups
    to a separate list. This allows to avoid unnecessary
    calculations of loadavg for VE children cpu cgroups
    in next patches, and it should positively improve
    the performance of calc_load_ve().
    
    https://jira.sw.ru/browse/PSBM-81572
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    
    (cherry picked from commit vz7 c9af0076fff0ac796dcbec8ef17424ae08a9f54d)
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    +++
    ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list
    
    Container's cpu cgroup is linked to "ve_root_list" on CT start.
    But if someone holds CT's cpu cgroup while CT is being stopped,
    next CT start tries to create same cpu cgroup (fails, already exists)
    and links this cpu cgroup to the "ve_root_list", thus corrupting it.
    
    As a consequence calc_load_ve() goes in an endless loop.
    
    Let's check if task_group has been already linked to the list and skip
    redundant linking.
    
    Locking scheme change:
    - drop rcu for list ve_root_list, we hold spinlocks anyway
    - use "load_ve_lock" spinlock for both list add/del/iterate,
      "task_group_lock" is unrelated here
    
    How to reproduce:
    
     # vzctl start 200
     # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
     # vzctl stop 200
     // At this moment VE cgroup got destroyed, but cpu cgroup is still alive
     // and linked to "ve_root_list" list
    
     # vzctl start 200
     // double add of same tg (same cpu cgroup) to "ve_root_list" list =>
     // list corruption => endless loop in next calc_load_ve() call
    
    https://jira.sw.ru/browse/PSBM-88251
    
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    
    v2 changes:
     - change locking scheme: drop rcu, use "load_ve_lock" everywhere
     - drop tg->linked field, check if linked using list_empty()
    
    [VvS RHEL77b rebase]
    
    (cherry picked from vz7 commit cba368b94c0ad159f676539f554e9cc9d53aedaa)
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    Cherry-picked from vz8 commit ba87abfb7367 ("ve/sched: Link VE root cpu
    cgroups in separate list")).
    
    Cleaned up !CONFIG_VE case in include/linux/sched.h
    
    Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 include/linux/sched.h  | 13 +++++++++++++
 kernel/cgroup/cgroup.c |  1 +
 kernel/sched/core.c    | 31 +++++++++++++++++++++++++++++++
 kernel/sched/sched.h   |  4 ++++
 4 files changed, 49 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c91d4777aedd..7b4ef0e90c05 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2238,4 +2238,17 @@ static inline void sched_core_free(struct task_struct *tsk) { }
 static inline void sched_core_fork(struct task_struct *p) { }
 #endif
 
+struct cgroup_subsys_state;
+#ifdef CONFIG_VE
+void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css);
+void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css);
+#else /* CONFIG_VE */
+static inline void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+}
+static inline void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+}
+#endif /* CONFIG_VE */
+
 #endif
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 111732c40ffc..08b7cff7a1c3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1920,6 +1920,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
 		cgrp = link->cgrp;
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
 	}
+	link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
 unlock:
 	rcu_read_unlock();
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b164cd9de057..62a31f1b9cc9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9734,6 +9734,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 	struct task_group *tg = css_tg(css);
 	struct task_group *parent = css_tg(css->parent);
 
+#ifdef CONFIG_VE
+	INIT_LIST_HEAD(&tg->ve_root_list);
+#endif
 	if (parent)
 		sched_online_group(tg, parent);
 
@@ -9753,6 +9756,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
 
+	unlink_ve_root_cpu_cgroup(css);
 	sched_offline_group(tg);
 }
 
@@ -10017,6 +10021,33 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
 	return (u64) scale_load_down(tg->shares);
 }
 
+#ifdef CONFIG_VE
+LIST_HEAD(ve_root_list);
+DEFINE_SPINLOCK(load_ve_lock);
+
+void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+	unsigned long flags;
+
+	spin_lock_irqsave(&load_ve_lock, flags);
+	BUG_ON(!(css->flags & CSS_ONLINE));
+	if (list_empty(&tg->ve_root_list))
+		list_add(&tg->ve_root_list, &ve_root_list);
+	spin_unlock_irqrestore(&load_ve_lock, flags);
+}
+
+void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+       struct task_group *tg = css_tg(css);
+       unsigned long flags;
+
+       spin_lock_irqsave(&load_ve_lock, flags);
+       list_del_init(&tg->ve_root_list);
+       spin_unlock_irqrestore(&load_ve_lock, flags);
+}
+#endif /* CONFIG_VE */
+
 #ifdef CONFIG_CFS_BANDWIDTH
 static DEFINE_MUTEX(cfs_constraints_mutex);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0a0ef82744bf..5ea3a045e6d1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -430,6 +430,10 @@ struct task_group {
 	struct autogroup	*autogroup;
 #endif
 
+#ifdef CONFIG_VE
+	struct list_head ve_root_list;
+#endif
+
 	/* Monotonic time in nsecs: */
 	u64			start_time;
 


More information about the Devel mailing list