[Devel] [PATCH RHEL7 COMMIT] sched: Port diff-sched-make-nr_cpus-limit-support-hierarchies
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Jun 4 06:14:15 PDT 2015
The commit is pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.9
------>
commit fa30b7498bba8fda3d43e7043f31894a92a53c0e
Author: Vladimir Davydov <vdavydov at parallels.com>
Date: Thu Jun 4 17:14:15 2015 +0400
sched: Port diff-sched-make-nr_cpus-limit-support-hierarchies
Author: Vladimir Davydov
Email: vdavydov at parallels.com
Subject: sched: make nr_cpus limit support hierarchies
Date: Wed, 26 Nov 2014 17:29:31 +0300
Before this patch the nr_cpus limiter was completely unaware of hierarchies,
i.e. creating a cpu sub-cgroup in a container would result in its
tasks being spread over all physical cpus irrespective of the
container's nr_cpus limit.
This patch changes that: sub-cgroups created inside a container's cpu
cgroup now respect the container's nr_cpus limit. For example, if a
container has nr_cpus=2, all its tasks should be running on 2 physical
cpus most of the time, even if some of the tasks have been moved into a
cpu sub-cgroup.
However, nr_cpus set inside a cpu sub-cgroup of a container still does
not limit parallelism, only the total cpu time granted to the
sub-cgroup, because a fully hierarchical implementation of the nr_cpus
limit would likely hurt performance significantly.
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
=============================================================================
Related to https://jira.sw.ru/browse/PSBM-33642
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
kernel/sched/fair.c | 157 ++++++++++++++++++++++++++++++++++++---------------
kernel/sched/sched.h | 1 +
2 files changed, 111 insertions(+), 47 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ecac940..25df080 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -263,6 +263,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
+static inline bool is_top_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se;
+
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+ return se && !se->parent;
+}
+
+static inline struct cfs_rq *top_cfs_rq_of(struct sched_entity *se)
+{
+ while (se->parent && se->parent->parent)
+ se = se->parent;
+ return cfs_rq_of(se);
+}
+
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
int force_update);
@@ -391,6 +406,16 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return NULL;
}
+static inline bool is_top_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ return false;
+}
+
+static inline struct cfs_rq *top_cfs_rq_of(struct sched_entity *se)
+{
+ return cfs_rq_of(se);
+}
+
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
@@ -3010,6 +3035,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ if (is_top_cfs_rq(cfs_rq) && !cfs_rq->load.weight)
+ inc_nr_active_cfs_rqs(cfs_rq);
+
/*
* Update the normalized vruntime before updating min_vruntime
* through callig update_curr().
@@ -3130,6 +3158,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq);
+
+ if (is_top_cfs_rq(cfs_rq) && !cfs_rq->load.weight)
+ dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
}
/*
@@ -4111,10 +4142,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
int boost = check_enqueue_boost(rq, p, flags);
- cfs_rq = task_cfs_rq(p);
- if (list_empty(&cfs_rq->tasks))
- inc_nr_active_cfs_rqs(cfs_rq);
-
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -4182,6 +4209,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int boosted = entity_boosted(se);
int task_sleep = flags & DEQUEUE_SLEEP;
+ if (task_sleep)
+ flags |= DEQUEUE_TASK_SLEEP;
+
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
@@ -4234,10 +4264,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_rq_runnable_avg(rq, 1);
}
hrtick_update(rq);
-
- cfs_rq = task_cfs_rq(p);
- if (list_empty(&cfs_rq->tasks))
- dec_nr_active_cfs_rqs(cfs_rq, task_sleep);
}
#ifdef CONFIG_SMP
@@ -4685,31 +4711,37 @@ done:
return target;
}
-static inline int cpu_is_runnable(struct task_struct *p, int cpu)
-{
- return cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu]);
-}
-
-static int select_runnable_cpu(struct task_struct *p, int new_cpu)
+static inline bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
{
+ struct cfs_rq *cfs_rq;
+ struct task_group *tg;
struct sched_domain *sd;
int prev_cpu = task_cpu(p);
int cpu;
- if (cpu_is_runnable(p, new_cpu))
- return new_cpu;
+ cfs_rq = top_cfs_rq_of(&p->se);
+ if (check_cpulimit_spread(cfs_rq, *new_cpu) > 0)
+ return false;
- if (cpu_is_runnable(p, prev_cpu))
- return prev_cpu;
+ tg = cfs_rq->tg;
+
+ if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
+ return true;
+
+ if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
+ *new_cpu = prev_cpu;
+ return true;
+ }
- for_each_domain(new_cpu, sd) {
+ for_each_domain(*new_cpu, sd) {
for_each_cpu_and(cpu, sched_domain_span(sd), &p->cpus_allowed) {
- if (cpu_is_runnable(p, cpu))
- return cpu;
+ if (cfs_rq_active(tg->cfs_rq[cpu])) {
+ *new_cpu = cpu;
+ return true;
+ }
}
}
-
- return new_cpu;
+ return false;
}
/*
@@ -4767,10 +4799,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
new_cpu = prev_cpu;
}
- if (check_cpulimit_spread(task_cfs_rq(p), new_cpu) <= 0) {
- new_cpu = select_runnable_cpu(p, new_cpu);
+ if (select_runnable_cpu(p, &new_cpu))
goto unlock;
- }
if (affine_sd) {
new_cpu = select_idle_sibling(p, new_cpu);
@@ -5047,21 +5077,33 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
static int cpulimit_balance_cpu_stop(void *data);
-static void trigger_cpulimit_balance(struct rq *this_rq, struct task_struct *p)
+static inline void trigger_cpulimit_balance(struct task_struct *p)
{
- int this_cpu = cpu_of(this_rq);
- int cpu, target_cpu = -1;
+ struct rq *this_rq;
+ struct cfs_rq *cfs_rq;
+ int this_cpu, cpu, target_cpu = -1;
struct sched_domain *sd;
+ if (!p->se.on_rq)
+ return;
+
+ this_rq = rq_of(cfs_rq_of(&p->se));
+ this_cpu = cpu_of(this_rq);
+
+ cfs_rq = top_cfs_rq_of(&p->se);
+ if (check_cpulimit_spread(cfs_rq, this_cpu) >= 0)
+ return;
+
raw_spin_unlock(&this_rq->lock);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- for_each_cpu_and(cpu, sched_domain_span(sd), tsk_cpus_allowed(p)) {
+ for_each_cpu_and(cpu, sched_domain_span(sd),
+ tsk_cpus_allowed(p)) {
if (cpu != this_cpu &&
- cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu])) {
+ cfs_rq_active(cfs_rq->tg->cfs_rq[cpu])) {
target_cpu = cpu;
goto unlock;
}
@@ -5084,8 +5126,7 @@ unlock:
}
}
#else
-static inline void trigger_cpulimit_balance(struct rq *this_rq,
- struct task_struct *p)
+static inline void trigger_cpulimit_balance(struct task_struct *p)
{
}
#endif
@@ -5103,9 +5144,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
put_prev_entity(cfs_rq, se);
}
- if (prev->se.on_rq &&
- check_cpulimit_spread(task_cfs_rq(prev), cpu_of(rq)) < 0)
- trigger_cpulimit_balance(rq, prev);
+ trigger_cpulimit_balance(prev);
}
/*
@@ -5435,27 +5474,27 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
+ struct cfs_rq *cfs_rq = top_cfs_rq_of(&p->se);
int tsk_cache_hot = 0;
- if (check_cpulimit_spread(task_cfs_rq(p), env->dst_cpu) < 0) {
+ if (check_cpulimit_spread(cfs_rq, env->dst_cpu) < 0) {
int cpu;
schedstat_inc(p, se.statistics.nr_failed_migrations_cpulimit);
- if (check_cpulimit_spread(task_cfs_rq(p), env->src_cpu) != 0)
+ if (check_cpulimit_spread(cfs_rq, env->src_cpu) != 0)
return 0;
if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
return 0;
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu])) {
+ if (cfs_rq_active(cfs_rq->tg->cfs_rq[cpu])) {
env->flags |= LBF_SOME_PINNED;
env->new_dst_cpu = cpu;
break;
}
}
-
return 0;
}
@@ -5683,7 +5722,8 @@ static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
static int move_task_groups(struct lb_env *env)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *top_cfs_rq;
+ struct task_group *tg;
unsigned long load;
int cur_pulled, pulled = 0;
@@ -5691,11 +5731,25 @@ static int move_task_groups(struct lb_env *env)
return 0;
for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
- if (check_cpulimit_spread(cfs_rq, env->src_cpu) != 0 ||
- cfs_rq_active(cfs_rq->tg->cfs_rq[env->dst_cpu]))
+ tg = cfs_rq->tg;
+ if (tg == &root_task_group)
+ continue;
+ /*
+ * A child always goes before its parent in a leaf_cfs_rq_list.
+ * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
+ * we could not migrate the child and therefore we should not
+ * even try to migrate the parent.
+ */
+ if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+ continue;
+
+ top_cfs_rq = is_top_cfs_rq(cfs_rq) ? cfs_rq :
+ top_cfs_rq_of(tg->se[env->src_cpu]);
+ if (check_cpulimit_spread(top_cfs_rq, env->src_cpu) != 0 ||
+ cfs_rq_active(top_cfs_rq->tg->cfs_rq[env->dst_cpu]))
continue;
- load = entity_h_load(cfs_rq->tg->se[env->src_cpu]);
+ load = entity_h_load(top_cfs_rq->tg->se[env->src_cpu]);
if ((load / 2) > env->imbalance)
continue;
@@ -5718,12 +5772,21 @@ static int move_task_groups(struct lb_env *env)
static int do_cpulimit_balance(struct lb_env *env)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *top_cfs_rq;
+ struct task_group *tg;
int pushed = 0;
for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
- if (check_cpulimit_spread(cfs_rq, env->src_cpu) < 0 &&
- cfs_rq_active(cfs_rq->tg->cfs_rq[env->dst_cpu]) &&
+ tg = cfs_rq->tg;
+ if (tg == &root_task_group)
+ continue;
+ /* see move_task_groups for why we skip such groups */
+ if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+ continue;
+ top_cfs_rq = is_top_cfs_rq(cfs_rq) ? cfs_rq :
+ top_cfs_rq_of(tg->se[env->src_cpu]);
+ if (check_cpulimit_spread(top_cfs_rq, env->src_cpu) < 0 &&
+ cfs_rq_active(top_cfs_rq->tg->cfs_rq[env->dst_cpu]) &&
can_migrate_task_group(cfs_rq, env))
pushed += move_task_group(cfs_rq, env);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c4f513b..8fac301 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1043,6 +1043,7 @@ static const u32 prio_to_wmult[40] = {
#define ENQUEUE_BOOST 8
#define DEQUEUE_SLEEP 1
+#define DEQUEUE_TASK_SLEEP 2
struct sched_class {
const struct sched_class *next;