[Devel] [PATCH rh8 1/6] ve/sched: Link VE root cpu cgroups in separate list

Konstantin Khorenko khorenko at virtuozzo.com
Thu Oct 22 15:54:47 MSK 2020


From: Kirill Tkhai <ktkhai at virtuozzo.com>

The idea is to link the small number of VE root cpu cgroups
into a separate list. This allows us to avoid unnecessary
loadavg calculations for VE children cpu cgroups in the next
patches, and it should improve the performance of
calc_load_ve().
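
To illustrate the intent (the actual calc_load_ve() conversion comes in
the follow-up patches, so the body below is only a sketch of its
expected shape, not code from this patch): the loadavg walk only needs
to visit the few linked VE root task groups, under the "load_ve_lock"
spinlock used in the final version of this patch:

 static void calc_load_ve(void)
 {
 	struct task_group *tg;
 	unsigned long flags;

 	spin_lock_irqsave(&load_ve_lock, flags);
 	list_for_each_entry(tg, &ve_root_list, ve_root_list) {
 		/*
 		 * Per-VE accounting would go here: fold this root's
 		 * runnable/uninterruptible counters into its loadavg.
 		 */
 	}
 	spin_unlock_irqrestore(&load_ve_lock, flags);
 }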

https://jira.sw.ru/browse/PSBM-81572

Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>

(cherry picked from commit vz7 c9af0076fff0ac796dcbec8ef17424ae08a9f54d)
Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>

+++
ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list

A Container's cpu cgroup is linked to "ve_root_list" on CT start.
But if someone holds a reference to the CT's cpu cgroup while the CT
is being stopped, the next CT start tries to create the same cpu
cgroup (and fails, as it already exists) and links this cpu cgroup to
"ve_root_list" again, thus corrupting the list.

As a consequence, calc_load_ve() goes into an endless loop.

Let's check whether the task_group has already been linked to the
list and skip the redundant linking.

Locking scheme change:
- drop RCU for the "ve_root_list" list: we hold spinlocks anyway
- use the "load_ve_lock" spinlock for list add/del/iterate;
  "task_group_lock" is unrelated here

How to reproduce:

 # vzctl start 200
 # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
 # vzctl stop 200
 // At this moment VE cgroup got destroyed, but cpu cgroup is still alive
 // and linked to "ve_root_list" list

 # vzctl start 200
 // double add of same tg (same cpu cgroup) to "ve_root_list" list =>
 // list corruption => endless loop in next calc_load_ve() call
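
For illustration, a simplified view of the corruption (assuming
standard <linux/list.h> semantics and that this tg is the only entry
on the list; names match the diff below):

 /* 1st CT start: fine, tg becomes the first entry after the head */
 list_add(&tg->ve_root_list, &ve_root_list);

 /*
  * 2nd CT start with the same tg: after this call
  * tg->ve_root_list.next points back to tg itself, so
  * list_for_each_entry() never reaches the list head again
  * => endless loop in calc_load_ve().
  */
 list_add(&tg->ve_root_list, &ve_root_list);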

https://jira.sw.ru/browse/PSBM-88251

Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Reviewed-by: Andrey Ryabinin <aryabinin at virtuozzo.com>

v2 changes:
 - change locking scheme: drop rcu, use "load_ve_lock" everywhere
 - drop tg->linked field, check if linked using list_empty()
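
The list_empty() check can serve as the "linked" flag because of the
standard <linux/list.h> behaviour (illustration only, not part of the
diff): an initialized but unlinked node points to itself, and
list_del_init() restores exactly that state:

 INIT_LIST_HEAD(&tg->ve_root_list);          /* unlinked, points to itself */
 /* list_empty(&tg->ve_root_list) == true  => not linked yet */
 list_add(&tg->ve_root_list, &ve_root_list);
 /* list_empty(&tg->ve_root_list) == false => linked, skip the re-add */
 list_del_init(&tg->ve_root_list);
 /* list_empty(&tg->ve_root_list) == true  => unlinked, can be linked again */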

[VvS RHEL77b rebase]

(cherry picked from vz7 commit cba368b94c0ad159f676539f554e9cc9d53aedaa)
Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
---
 include/linux/sched.h  |  8 ++++++++
 kernel/cgroup/cgroup.c |  1 +
 kernel/sched/core.c    | 31 +++++++++++++++++++++++++++++++
 kernel/sched/sched.h   |  4 ++++
 4 files changed, 44 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4326aa24e9dc..cabed6a47a70 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2015,4 +2015,12 @@ static inline void rseq_syscall(struct pt_regs *regs)
 
 #endif
 
+struct cgroup_subsys_state;
+#ifdef CONFIG_VE
+extern void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css);
+extern void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css);
+#else /* CONFIG_VE */
+static inline void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css) { }
+#endif /* CONFIG_VE */
+
 #endif
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 08137d43f3ab..4ee3eb24b0d1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1894,6 +1894,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
 		cgrp = link->cgrp;
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
 	}
+	link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
 unlock:
 	rcu_read_unlock();
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a57956d64d6..a6100bf3f625 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6581,6 +6581,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 	struct task_group *tg = css_tg(css);
 	struct task_group *parent = css_tg(css->parent);
 
+#ifdef CONFIG_VE
+	INIT_LIST_HEAD(&tg->ve_root_list);
+#endif
 	if (parent)
 		sched_online_group(tg, parent);
 	return 0;
@@ -6590,6 +6593,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
 
+	unlink_ve_root_cpu_cgroup(css);
 	sched_offline_group(tg);
 }
 
@@ -6677,6 +6681,33 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
 	return (u64) scale_load_down(tg->shares);
 }
 
+#ifdef CONFIG_VE
+LIST_HEAD(ve_root_list);
+DEFINE_SPINLOCK(load_ve_lock);
+
+void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+	unsigned long flags;
+
+	spin_lock_irqsave(&load_ve_lock, flags);
+	BUG_ON(!(css->flags & CSS_ONLINE));
+	if (list_empty(&tg->ve_root_list))
+		list_add(&tg->ve_root_list, &ve_root_list);
+	spin_unlock_irqrestore(&load_ve_lock, flags);
+}
+
+void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+	unsigned long flags;
+
+	spin_lock_irqsave(&load_ve_lock, flags);
+	list_del_init(&tg->ve_root_list);
+	spin_unlock_irqrestore(&load_ve_lock, flags);
+}
+#endif /* CONFIG_VE */
+
 #ifdef CONFIG_CFS_BANDWIDTH
 static DEFINE_MUTEX(cfs_constraints_mutex);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2f0c26b2c50..93bf1d78c27d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -404,6 +404,10 @@ struct task_group {
 	struct autogroup	*autogroup;
 #endif
 
+#ifdef CONFIG_VE
+	struct list_head ve_root_list;
+#endif
+
 	/* Monotonic time in nsecs: */
 	u64			start_time;
 
-- 
2.28.0


