[Devel] [PATCH vz9 12/23] ve/sched: Link VE root cpu cgroups in separate list

Nikita Yushchenko nikita.yushchenko at virtuozzo.com
Fri Oct 1 18:53:20 MSK 2021


From: Kirill Tkhai <ktkhai at virtuozzo.com>

The idea is to link the small number of VE root cpu cgroups
into a separate list. This allows us to avoid unnecessary
loadavg calculations for VE child cpu cgroups
in the next patches, and it should positively improve
the performance of calc_load_ve().

https://jira.sw.ru/browse/PSBM-81572

Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>

(cherry picked from commit vz7 c9af0076fff0ac796dcbec8ef17424ae08a9f54d)
Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>

+++
ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list

Container's cpu cgroup is linked to "ve_root_list" on CT start.
But if someone holds CT's cpu cgroup while CT is being stopped,
the next CT start tries to create the same cpu cgroup (this fails, as it already exists)
and links this cpu cgroup to the "ve_root_list", thus corrupting it.

As a consequence calc_load_ve() goes in an endless loop.

Let's check whether the task_group has already been linked to the list and
skip the redundant linking.

Locking scheme change:
- drop rcu for list ve_root_list, we hold spinlocks anyway
- use "load_ve_lock" spinlock for both list add/del/iterate,
  "task_group_lock" is unrelated here

How to reproduce:

 # vzctl start 200
 # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
 # vzctl stop 200
 // At this moment VE cgroup got destroyed, but cpu cgroup is still alive
 // and linked to "ve_root_list" list

 # vzctl start 200
 // double add of same tg (same cpu cgroup) to "ve_root_list" list =>
 // list corruption => endless loop in next calc_load_ve() call

https://jira.sw.ru/browse/PSBM-88251

Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>

v2 changes:
 - change locking scheme: drop rcu, use "load_ve_lock" everywhere
 - drop tg->linked field, check if linked using list_empty()

[VvS RHEL77b rebase]

(cherry picked from vz7 commit cba368b94c0ad159f676539f554e9cc9d53aedaa)
Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>

Cherry-picked from vz8 commit ba87abfb7367 ("ve/sched: Link VE root cpu
cgroups in separate list").

Cleaned up !CONFIG_VE case in include/linux/sched.h

Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 include/linux/sched.h  | 13 +++++++++++++
 kernel/cgroup/cgroup.c |  1 +
 kernel/sched/core.c    | 31 +++++++++++++++++++++++++++++++
 kernel/sched/sched.h   |  4 ++++
 4 files changed, 49 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c91d4777aedd..7b4ef0e90c05 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2238,4 +2238,17 @@ static inline void sched_core_free(struct task_struct *tsk) { }
 static inline void sched_core_fork(struct task_struct *p) { }
 #endif
 
+struct cgroup_subsys_state;
+#ifdef CONFIG_VE
+void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css);
+void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css);
+#else /* CONFIG_VE */
+static inline void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+}
+static inline void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+}
+#endif /* CONFIG_VE */
+
 #endif
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 111732c40ffc..08b7cff7a1c3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1920,6 +1920,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
 		cgrp = link->cgrp;
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
 	}
+	link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
 unlock:
 	rcu_read_unlock();
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b164cd9de057..62a31f1b9cc9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9734,6 +9734,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 	struct task_group *tg = css_tg(css);
 	struct task_group *parent = css_tg(css->parent);
 
+#ifdef CONFIG_VE
+	INIT_LIST_HEAD(&tg->ve_root_list);
+#endif
 	if (parent)
 		sched_online_group(tg, parent);
 
@@ -9753,6 +9756,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
 
+	unlink_ve_root_cpu_cgroup(css);
 	sched_offline_group(tg);
 }
 
@@ -10017,6 +10021,33 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
 	return (u64) scale_load_down(tg->shares);
 }
 
+#ifdef CONFIG_VE
+LIST_HEAD(ve_root_list);
+DEFINE_SPINLOCK(load_ve_lock);
+
+void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+	unsigned long flags;
+
+	spin_lock_irqsave(&load_ve_lock, flags);
+	BUG_ON(!(css->flags & CSS_ONLINE));
+	if (list_empty(&tg->ve_root_list))
+		list_add(&tg->ve_root_list, &ve_root_list);
+	spin_unlock_irqrestore(&load_ve_lock, flags);
+}
+
+void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+	unsigned long flags;
+
+	spin_lock_irqsave(&load_ve_lock, flags);
+	list_del_init(&tg->ve_root_list);
+	spin_unlock_irqrestore(&load_ve_lock, flags);
+}
+#endif /* CONFIG_VE */
+
 #ifdef CONFIG_CFS_BANDWIDTH
 static DEFINE_MUTEX(cfs_constraints_mutex);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0a0ef82744bf..5ea3a045e6d1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -430,6 +430,10 @@ struct task_group {
 	struct autogroup	*autogroup;
 #endif
 
+#ifdef CONFIG_VE
+	struct list_head ve_root_list;
+#endif
+
 	/* Monotonic time in nsecs: */
 	u64			start_time;
 
-- 
2.30.2



More information about the Devel mailing list