[Devel] [PATCH rh7 4/4] sched: use topmost limited ancestor for cpulimit balancing

Vladimir Davydov vdavydov at virtuozzo.com
Wed Jul 13 09:37:18 PDT 2016


We want to keep all processes of a container's cgroup packed on the
minimal allowed number of cpus, which is set by the cpulimit. Doing
this properly for deep hierarchies is tricky, if not impossible,
without introducing tremendous overhead, so initially we implemented
this feature exclusively for top-level cgroups. This is no longer
enough, as containers can be created in machine.slice. So this patch
makes cpulimit balancing work for the topmost cgroup that has a cpu
limit set. This way, no matter whether containers are created under
the root or in machine.slice, cpulimit balancing will always be
applied to the container's cgroup, as machine.slice isn't supposed to
have a cpu limit set.
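For illustration, here is a minimal user-space sketch (not kernel code)
of the resolution rule this patch adds: a group points to itself if
none of its ancestors has a quota set, and inherits its parent's
pointer otherwise. The struct, hierarchy and quota values below
(machine.slice, ct101) are made up for the example.

#include <stdio.h>

#define RUNTIME_INF (~0ULL)

struct group {
	const char *name;
	unsigned long long quota;		/* RUNTIME_INF == no limit */
	struct group *parent;
	struct group *topmost_limited_ancestor;
};

static void update_topmost_limited_ancestor(struct group *g)
{
	struct group *parent = g->parent;

	if (!parent) {
		g->topmost_limited_ancestor = g;	/* root points to itself */
		return;
	}
	/* No limited ancestor above the parent? Then g is the candidate. */
	if (parent->topmost_limited_ancestor == parent &&
	    parent->quota == RUNTIME_INF)
		g->topmost_limited_ancestor = g;
	else
		g->topmost_limited_ancestor = parent->topmost_limited_ancestor;
}

int main(void)
{
	struct group root  = { "root",          RUNTIME_INF, NULL,   NULL };
	struct group slice = { "machine.slice", RUNTIME_INF, &root,  NULL };
	struct group ct    = { "ct101",         100000ULL,   &slice, NULL };
	struct group inner = { "ct101/payload", RUNTIME_INF, &ct,    NULL };

	update_topmost_limited_ancestor(&root);
	update_topmost_limited_ancestor(&slice);
	update_topmost_limited_ancestor(&ct);
	update_topmost_limited_ancestor(&inner);

	/* Both the limited group and its child resolve to ct101. */
	printf("%s -> %s\n", ct.name,    ct.topmost_limited_ancestor->name);
	printf("%s -> %s\n", inner.name, inner.topmost_limited_ancestor->name);
	return 0;
}

With this rule, both the container's cgroup and any cgroup nested
inside it resolve to the container's cgroup, which is what the
balancing code in fair.c now uses.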

https://jira.sw.ru/browse/PSBM-49203

Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
---
 kernel/sched/core.c  | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c  | 36 +++++-------------------------
 kernel/sched/sched.h |  2 ++
 3 files changed, 69 insertions(+), 31 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94deef41f05a..657b8e4ba8d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7557,6 +7557,10 @@ void __init sched_init(void)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
+#ifdef CONFIG_CFS_CPULIMIT
+	root_task_group.topmost_limited_ancestor = &root_task_group;
+#endif
+
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -7882,6 +7886,8 @@ err:
 	return ERR_PTR(-ENOMEM);
 }
 
+static void tg_update_topmost_limited_ancestor(struct task_group *tg);
+
 void sched_online_group(struct task_group *tg, struct task_group *parent)
 {
 	unsigned long flags;
@@ -7894,6 +7900,9 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 	tg->parent = parent;
 	INIT_LIST_HEAD(&tg->children);
 	list_add_rcu(&tg->siblings, &parent->children);
+
+	tg_update_topmost_limited_ancestor(tg);
+
 	spin_unlock_irqrestore(&task_group_lock, flags);
 }
 
@@ -8428,6 +8437,8 @@ const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 
+static void tg_limit_toggled(struct task_group *tg);
+
 /* call with cfs_constraints_mutex held */
 static int __tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 {
@@ -8485,6 +8496,8 @@ static int __tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 			unthrottle_cfs_rq(cfs_rq);
 		raw_spin_unlock_irq(&rq->lock);
 	}
+	if (runtime_enabled != runtime_was_enabled)
+		tg_limit_toggled(tg);
 	return ret;
 }
 
@@ -8662,6 +8675,49 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
 }
 
 #ifdef CONFIG_CFS_CPULIMIT
+static int __tg_update_topmost_limited_ancestor(struct task_group *tg, void *unused)
+{
+	struct task_group *parent = tg->parent;
+
+	/*
+	 * If neither the parent nor any of its ancestors is limited, the
+	 * task group becomes a topmost limited ancestor itself, provided it
+	 * has a limit set; otherwise it inherits the pointer from its parent.
+	 */
+	if (parent->topmost_limited_ancestor == parent &&
+	    parent->cfs_bandwidth.quota == RUNTIME_INF)
+		tg->topmost_limited_ancestor = tg;
+	else
+		tg->topmost_limited_ancestor = parent->topmost_limited_ancestor;
+	return 0;
+}
+
+static void tg_update_topmost_limited_ancestor(struct task_group *tg)
+{
+	__tg_update_topmost_limited_ancestor(tg, NULL);
+}
+
+static void tg_limit_toggled(struct task_group *tg)
+{
+	if (tg->topmost_limited_ancestor != tg) {
+		/*
+		 * This task group is not a topmost limited ancestor, so both
+		 * it and all its children must already point to their topmost
+		 * limited ancestor, and we have nothing to do.
+		 */
+		return;
+	}
+
+	/*
+	 * This task group is a topmost limited ancestor. Walk over all its
+	 * children and update their pointers to the topmost limited ancestor.
+	 */
+
+	spin_lock_irq(&task_group_lock);
+	walk_tg_tree_from(tg, __tg_update_topmost_limited_ancestor, tg_nop, NULL);
+	spin_unlock_irq(&task_group_lock);
+}
+
 static void tg_update_cpu_limit(struct task_group *tg)
 {
 	long quota, period;
@@ -8736,6 +8792,12 @@ static int nr_cpus_write_u64(struct cgroup *cgrp, struct cftype *cftype,
 	return tg_set_cpu_limit(tg, tg->cpu_rate, nr_cpus);
 }
 #else
+static void tg_update_topmost_limited_ancestor(struct task_group *tg)
+{
+}
+static void tg_limit_toggled(struct task_group *tg)
+{
+}
 static void tg_update_cpu_limit(struct task_group *tg)
 {
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2ff38fc1d600..515685f77217 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -265,20 +265,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 	return grp->my_q;
 }
 
-static inline bool is_top_cfs_rq(struct cfs_rq *cfs_rq)
-{
-	struct task_group *tg = cfs_rq->tg;
-
-	return tg->parent == &root_task_group;
-}
-
-static inline struct cfs_rq *top_cfs_rq_of(struct sched_entity *se)
-{
-	while (se->parent && se->parent->parent)
-		se = se->parent;
-	return cfs_rq_of(se);
-}
-
 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
 				       int force_update);
 
@@ -397,16 +383,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 	return NULL;
 }
 
-static inline bool is_top_cfs_rq(struct cfs_rq *cfs_rq)
-{
-	return false;
-}
-
-static inline struct cfs_rq *top_cfs_rq_of(struct sched_entity *se)
-{
-	return cfs_rq_of(se);
-}
-
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
@@ -4707,7 +4683,7 @@ static inline bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
 	int prev_cpu = task_cpu(p);
 	int cpu;
 
-	tg = top_cfs_rq_of(&p->se)->tg;
+	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
 	if (check_cpulimit_spread(tg, *new_cpu) > 0)
 		return false;
 
@@ -5077,7 +5053,7 @@ static inline void trigger_cpulimit_balance(struct task_struct *p)
 	if (!p->se.on_rq || this_rq->active_balance)
 		return;
 
-	tg = top_cfs_rq_of(&p->se)->tg;
+	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
 	if (check_cpulimit_spread(tg, this_cpu) >= 0)
 		return;
 
@@ -5451,7 +5427,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
 static inline int can_migrate_task_cpulimit(struct task_struct *p, struct lb_env *env)
 {
 #ifdef CONFIG_CFS_CPULIMIT
-	struct task_group *tg = top_cfs_rq_of(&p->se)->tg;
+	struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
 
 	if (check_cpulimit_spread(tg, env->dst_cpu) < 0) {
 		int cpu;
@@ -5732,8 +5708,7 @@ static int move_task_groups(struct lb_env *env)
 		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
 			continue;
 
-		tg = is_top_cfs_rq(cfs_rq) ? cfs_rq->tg :
-				top_cfs_rq_of(cfs_rq->tg->se[env->src_cpu])->tg;
+		tg = cfs_rq->tg->topmost_limited_ancestor;
 
 		if (check_cpulimit_spread(tg, env->src_cpu) != 0 ||
 		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]))
@@ -5772,8 +5747,7 @@ static int do_cpulimit_balance(struct lb_env *env)
 		/* see move_task_groups for why we skip such groups */
 		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
 			continue;
-		tg = is_top_cfs_rq(cfs_rq) ? cfs_rq->tg :
-				top_cfs_rq_of(cfs_rq->tg->se[env->src_cpu])->tg;
+		tg = cfs_rq->tg->topmost_limited_ancestor;
 		if (check_cpulimit_spread(tg, env->src_cpu) < 0 &&
 		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]) &&
 		    can_migrate_task_group(cfs_rq, env))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8ff06a779a3d..2bdf80bb2fd4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -182,6 +182,8 @@ struct task_group {
 	unsigned long cpu_rate;
 	unsigned int nr_cpus;
 	atomic_t nr_cpus_active;
+	struct task_group *topmost_limited_ancestor; /* self if none of the
+							ancestors is limited */
 #endif
 
 #if defined(CONFIG_FAIR_GROUP_SCHED)
-- 
2.1.4


