[Devel] [PATCH RHEL7 COMMIT] sched: Port diff-sched-make-nr_cpus-limit-support-hierarchies
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Jun 4 06:14:15 PDT 2015
The commit is pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.9
------>
commit fa30b7498bba8fda3d43e7043f31894a92a53c0e
Author: Vladimir Davydov <vdavydov at parallels.com>
Date: Thu Jun 4 17:14:15 2015 +0400
sched: Port diff-sched-make-nr_cpus-limit-support-hierarchies
Author: Vladimir Davydov
Email: vdavydov at parallels.com
Subject: sched: make nr_cpus limit support hierarchies
Date: Wed, 26 Nov 2014 17:29:31 +0300
Before this patch the nr_cpus limiter was completely unaware of hierarchies,
i.e. creating a cpu sub-cgroup in a container would result in its
tasks being spread over all physical cpus irrespective of the
container's nr_cpus limit.
This patch changes that: sub-cgroups created inside a container's cpu
cgroup now respect the container's nr_cpus limit. For example, if a
container has nr_cpus=2, all its tasks should be running on 2 physical
cpus most of the time, even if some of the tasks have been moved into a
cpu sub-cgroup.
However, nr_cpus set inside a cpu sub-cgroup of a container still does
not limit parallelism, only the total cpu time granted to the
sub-cgroup, because a fully hierarchical implementation of the nr_cpus
limit would likely hurt performance significantly.
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
=============================================================================
Related to https://jira.sw.ru/browse/PSBM-33642
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
kernel/sched/fair.c | 157 ++++++++++++++++++++++++++++++++++++---------------
kernel/sched/sched.h | 1 +
2 files changed, 111 insertions(+), 47 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ecac940..25df080 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -263,6 +263,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
+static inline bool is_top_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se;
+
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+ return se && !se->parent;
+}
+
+static inline struct cfs_rq *top_cfs_rq_of(struct sched_entity *se)
+{
+ while (se->parent && se->parent->parent)
+ se = se->parent;
+ return cfs_rq_of(se);
+}
+
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
int force_update);
@@ -391,6 +406,16 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return NULL;
}
+static inline bool is_top_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ return false;
+}
+
+static inline struct cfs_rq *top_cfs_rq_of(struct sched_entity *se)
+{
+ return cfs_rq_of(se);
+}
+
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
@@ -3010,6 +3035,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ if (is_top_cfs_rq(cfs_rq) && !cfs_rq->load.weight)
+ inc_nr_active_cfs_rqs(cfs_rq);
+
/*
* Update the normalized vruntime before updating min_vruntime
* through callig update_curr().
@@ -3130,6 +3158,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq);
+
+ if (is_top_cfs_rq(cfs_rq) && !cfs_rq->load.weight)
+ dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
}
/*
@@ -4111,10 +4142,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
int boost = check_enqueue_boost(rq, p, flags);
- cfs_rq = task_cfs_rq(p);
- if (list_empty(&cfs_rq->tasks))
- inc_nr_active_cfs_rqs(cfs_rq);
-
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -4182,6 +4209,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int boosted = entity_boosted(se);
int task_sleep = flags & DEQUEUE_SLEEP;
+ if (task_sleep)
+ flags |= DEQUEUE_TASK_SLEEP;
+
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
@@ -4234,10 +4264,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_rq_runnable_avg(rq, 1);
}
hrtick_update(rq);
-
- cfs_rq = task_cfs_rq(p);
- if (list_empty(&cfs_rq->tasks))
- dec_nr_active_cfs_rqs(cfs_rq, task_sleep);
}
#ifdef CONFIG_SMP
@@ -4685,31 +4711,37 @@ done:
return target;
}
-static inline int cpu_is_runnable(struct task_struct *p, int cpu)
-{
- return cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu]);
-}
-
-static int select_runnable_cpu(struct task_struct *p, int new_cpu)
+static inline bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
{
+ struct cfs_rq *cfs_rq;
+ struct task_group *tg;
struct sched_domain *sd;
int prev_cpu = task_cpu(p);
int cpu;
- if (cpu_is_runnable(p, new_cpu))
- return new_cpu;
+ cfs_rq = top_cfs_rq_of(&p->se);
+ if (check_cpulimit_spread(cfs_rq, *new_cpu) > 0)
+ return false;
- if (cpu_is_runnable(p, prev_cpu))
- return prev_cpu;
+ tg = cfs_rq->tg;
+
+ if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
+ return true;
+
+ if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
+ *new_cpu = prev_cpu;
+ return true;
+ }
- for_each_domain(new_cpu, sd) {
+ for_each_domain(*new_cpu, sd) {
for_each_cpu_and(cpu, sched_domain_span(sd), &p->cpus_allowed) {
- if (cpu_is_runnable(p, cpu))
- return cpu;
+ if (cfs_rq_active(tg->cfs_rq[cpu])) {
+ *new_cpu = cpu;
+ return true;
+ }
}
}
-
- return new_cpu;
+ return false;
}
/*
@@ -4767,10 +4799,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
new_cpu = prev_cpu;
}
- if (check_cpulimit_spread(task_cfs_rq(p), new_cpu) <= 0) {
- new_cpu = select_runnable_cpu(p, new_cpu);
+ if (select_runnable_cpu(p, &new_cpu))
goto unlock;
- }
if (affine_sd) {
new_cpu = select_idle_sibling(p, new_cpu);
@@ -5047,21 +5077,33 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
static int cpulimit_balance_cpu_stop(void *data);
-static void trigger_cpulimit_balance(struct rq *this_rq, struct task_struct *p)
+static inline void trigger_cpulimit_balance(struct task_struct *p)
{
- int this_cpu = cpu_of(this_rq);
- int cpu, target_cpu = -1;
+ struct rq *this_rq;
+ struct cfs_rq *cfs_rq;
+ int this_cpu, cpu, target_cpu = -1;
struct sched_domain *sd;
+ if (!p->se.on_rq)
+ return;
+
+ this_rq = rq_of(cfs_rq_of(&p->se));
+ this_cpu = cpu_of(this_rq);
+
+ cfs_rq = top_cfs_rq_of(&p->se);
+ if (check_cpulimit_spread(cfs_rq, this_cpu) >= 0)
+ return;
+
raw_spin_unlock(&this_rq->lock);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- for_each_cpu_and(cpu, sched_domain_span(sd), tsk_cpus_allowed(p)) {
+ for_each_cpu_and(cpu, sched_domain_span(sd),
+ tsk_cpus_allowed(p)) {
if (cpu != this_cpu &&
- cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu])) {
+ cfs_rq_active(cfs_rq->tg->cfs_rq[cpu])) {
target_cpu = cpu;
goto unlock;
}
@@ -5084,8 +5126,7 @@ unlock:
}
}
#else
-static inline void trigger_cpulimit_balance(struct rq *this_rq,
- struct task_struct *p)
+static inline void trigger_cpulimit_balance(struct task_struct *p)
{
}
#endif
@@ -5103,9 +5144,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
put_prev_entity(cfs_rq, se);
}
- if (prev->se.on_rq &&
- check_cpulimit_spread(task_cfs_rq(prev), cpu_of(rq)) < 0)
- trigger_cpulimit_balance(rq, prev);
+ trigger_cpulimit_balance(prev);
}
/*
@@ -5435,27 +5474,27 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
+ struct cfs_rq *cfs_rq = top_cfs_rq_of(&p->se);
int tsk_cache_hot = 0;
- if (check_cpulimit_spread(task_cfs_rq(p), env->dst_cpu) < 0) {
+ if (check_cpulimit_spread(cfs_rq, env->dst_cpu) < 0) {
int cpu;
schedstat_inc(p, se.statistics.nr_failed_migrations_cpulimit);
- if (check_cpulimit_spread(task_cfs_rq(p), env->src_cpu) != 0)
+ if (check_cpulimit_spread(cfs_rq, env->src_cpu) != 0)
return 0;
if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
return 0;
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cfs_rq_active(task_cfs_rq(p)->tg->cfs_rq[cpu])) {
+ if (cfs_rq_active(cfs_rq->tg->cfs_rq[cpu])) {
env->flags |= LBF_SOME_PINNED;
env->new_dst_cpu = cpu;
break;
}
}
-
return 0;
}
@@ -5683,7 +5722,8 @@ static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
static int move_task_groups(struct lb_env *env)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *top_cfs_rq;
+ struct task_group *tg;
unsigned long load;
int cur_pulled, pulled = 0;
@@ -5691,11 +5731,25 @@ static int move_task_groups(struct lb_env *env)
return 0;
for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
- if (check_cpulimit_spread(cfs_rq, env->src_cpu) != 0 ||
- cfs_rq_active(cfs_rq->tg->cfs_rq[env->dst_cpu]))
+ tg = cfs_rq->tg;
+ if (tg == &root_task_group)
+ continue;
+ /*
+ * A child always goes before its parent in a leaf_cfs_rq_list.
+ * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
+ * we could not migrate the child and therefore we should not
+ * even try to migrate the parent.
+ */
+ if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+ continue;
+
+ top_cfs_rq = is_top_cfs_rq(cfs_rq) ? cfs_rq :
+ top_cfs_rq_of(tg->se[env->src_cpu]);
+ if (check_cpulimit_spread(top_cfs_rq, env->src_cpu) != 0 ||
+ cfs_rq_active(top_cfs_rq->tg->cfs_rq[env->dst_cpu]))
continue;
- load = entity_h_load(cfs_rq->tg->se[env->src_cpu]);
+ load = entity_h_load(top_cfs_rq->tg->se[env->src_cpu]);
if ((load / 2) > env->imbalance)
continue;
@@ -5718,12 +5772,21 @@ static int move_task_groups(struct lb_env *env)
static int do_cpulimit_balance(struct lb_env *env)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *top_cfs_rq;
+ struct task_group *tg;
int pushed = 0;
for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
- if (check_cpulimit_spread(cfs_rq, env->src_cpu) < 0 &&
- cfs_rq_active(cfs_rq->tg->cfs_rq[env->dst_cpu]) &&
+ tg = cfs_rq->tg;
+ if (tg == &root_task_group)
+ continue;
+ /* see move_task_groups for why we skip such groups */
+ if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+ continue;
+ top_cfs_rq = is_top_cfs_rq(cfs_rq) ? cfs_rq :
+ top_cfs_rq_of(tg->se[env->src_cpu]);
+ if (check_cpulimit_spread(top_cfs_rq, env->src_cpu) < 0 &&
+ cfs_rq_active(top_cfs_rq->tg->cfs_rq[env->dst_cpu]) &&
can_migrate_task_group(cfs_rq, env))
pushed += move_task_group(cfs_rq, env);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c4f513b..8fac301 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1043,6 +1043,7 @@ static const u32 prio_to_wmult[40] = {
#define ENQUEUE_BOOST 8
#define DEQUEUE_SLEEP 1
+#define DEQUEUE_TASK_SLEEP 2
struct sched_class {
const struct sched_class *next;