[Devel] [PATCH RH9 08/12] sched: Port CONFIG_CFS_CPULIMIT feature
Alexander Mikhalitsyn
alexander.mikhalitsyn at virtuozzo.com
Thu Sep 23 14:31:32 MSK 2021
From: Kirill Tkhai <ktkhai at virtuozzo.com>
Add the possibility to limit the CPUs used by a cgroup/container.
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
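The central decision helper is check_cpulimit_spread(), added in
kernel/sched/fair.c below and applied to the task group's
topmost_limited_ancestor. A condensed sketch of how the wake-up path
(select_runnable_cpu() in the hunks below) consumes its return value;
the comments are explanatory and not part of the hunks themselves:

    struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;

    if (check_cpulimit_spread(tg, new_cpu) > 0) {
            /* The group has no effective limit, or fewer cfs_rqs are
             * active than the limit allows: normal placement. */
    } else if (cfs_rq_active(tg->cfs_rq[new_cpu])) {
            /* The limit is reached, but new_cpu already runs this
             * group, so the task may be placed there. */
    } else {
            /* The limit is reached and new_cpu is not active for the
             * group: fall back to a CPU where the group already has
             * an active cfs_rq. */
    }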
+++
sched: Allow configuring sched_vcpu_hotslice and sched_cpulimit_scale_cpufreq
Let's make the sysctls ported from vz8 really configurable.
These are lost hunks from vz7 commits:
f06fef25c0859 ("sched: Add cpulimit base interfaces")
4805ea1432210 ("ve/sched: port vcpu hotslice")
https://jira.sw.ru/browse/PSBM-127780
Fixes: ddbb18ac80519 ("sched: Port CONFIG_CFS_CPULIMIT feature")
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
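With the hunks below, both knobs land in kern_table, i.e. they show up
as /proc/sys/kernel/sched_vcpu_hotslice (nanoseconds, default 5000000)
and /proc/sys/kernel/sched_cpulimit_scale_cpufreq (0/1, default 1).
A condensed sketch of where each value is consumed; the comments are
explanatory and not part of the hunks:

    /* sched_vcpu_hotslice: when the last task of a group's cfs_rq goes
     * to sleep, dec_nr_active_cfs_rqs() keeps the cfs_rq counted as
     * active for another hotslice nanoseconds via an hrtimer, so a
     * short sleep does not immediately free up the vCPU slot. */
    hrtimer_start_range_ns(&cfs_rq->active_timer,
                           ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
                           HRTIMER_MODE_REL_PINNED);

    /* sched_cpulimit_scale_cpufreq: when enabled,
     * sched_cpulimit_scale_cpufreq(freq) scales a frequency value by
     * cpu_rate / (num_online_vcpus() * MAX_CPU_RATE); otherwise the
     * frequency is returned unchanged. */
    return div_u64((u64)freq * rate, max_rate);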
+++
kernel/sched/fair.c: Add missing update_rq_clock() calls
We've got a hard lockup which seems to be caused by the mgag200
console printk code calling schedule_work() from the scheduler
with rq->lock held:
#5 [ffffb79e034239a8] native_queued_spin_lock_slowpath at ffffffff8b50c6c6
#6 [ffffb79e034239a8] _raw_spin_lock at ffffffff8bc96e5c
#7 [ffffb79e034239b0] try_to_wake_up at ffffffff8b4e26ff
#8 [ffffb79e03423a10] __queue_work at ffffffff8b4ce3f3
#9 [ffffb79e03423a58] queue_work_on at ffffffff8b4ce714
#10 [ffffb79e03423a68] mga_imageblit at ffffffffc026d666 [mgag200]
#11 [ffffb79e03423a80] soft_cursor at ffffffff8b8a9d84
#12 [ffffb79e03423ad8] bit_cursor at ffffffff8b8a99b2
#13 [ffffb79e03423ba0] hide_cursor at ffffffff8b93bc7a
#14 [ffffb79e03423bb0] vt_console_print at ffffffff8b93e07d
#15 [ffffb79e03423c18] console_unlock at ffffffff8b518f0e
#16 [ffffb79e03423c68] vprintk_emit_log at ffffffff8b51acf7
#17 [ffffb79e03423cc0] vprintk_default at ffffffff8b51adcd
#18 [ffffb79e03423cd0] printk at ffffffff8b51b3d6
#19 [ffffb79e03423d30] __warn_printk at ffffffff8b4b13a0
#20 [ffffb79e03423d98] assert_clock_updated at ffffffff8b4dd293
#21 [ffffb79e03423da0] deactivate_task at ffffffff8b4e12d1
#22 [ffffb79e03423dc8] move_task_group at ffffffff8b4eaa5b
#23 [ffffb79e03423e00] cpulimit_balance_cpu_stop at ffffffff8b4f02f3
#24 [ffffb79e03423eb0] cpu_stopper_thread at ffffffff8b576b67
#25 [ffffb79e03423ee8] smpboot_thread_fn at ffffffff8b4d9125
#26 [ffffb79e03423f10] kthread at ffffffff8b4d4fc2
#27 [ffffb79e03423f50] ret_from_fork at ffffffff8be00255
The printk was called because assert_clock_updated() triggered
    SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
This means that we are missing a necessary update_rq_clock() call.
Add one to cpulimit_balance_cpu_stop() to fix the warning.
Also add one in load_balance() before the move_task_groups() call,
which seems to be another place missing this call.
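The resulting pattern in cpulimit_balance_cpu_stop() looks roughly as
follows (a condensed view of the hunk in kernel/sched/fair.c below;
the comments are explanatory and not part of the hunk):

    double_lock_balance(rq, target_rq);
    /* ... pick a matching sched_domain, set up the lb_env ... */

    /* Refresh both clocks before any task is moved: detach_task()
     * ends up in deactivate_task() on rq, attach_task() in
     * activate_task() on target_rq, and both paths run
     * assert_clock_updated() under SCHED_DEBUG. */
    update_rq_clock(rq);
    update_rq_clock(target_rq);

    if (do_cpulimit_balance(&env))
            schedstat_inc(sd->clb_pushed);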
https://jira.sw.ru/browse/PSBM-108013
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
+++
kernel/sched/fair.c: Add more missing update_rq_clock() calls
Add update_rq_clock() for 'target_rq' to avoid the WARN() coming
from attach_task(). Also add rq_repin_lock(busiest, &rf) in
load_balance() for detach_task(). An extra update_rq_clock() isn't
needed there since the clock was already updated before, but we need
the repin since the rq lock was released after that update.
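The resulting fallback in load_balance() (a condensed view of the hunk
below; the comments are explanatory and not part of the hunk):

    if (!ld_moved && (env.flags & LBF_ALL_PINNED)) {
            env.loop = 0;
            local_irq_save(rf.flags);
            double_rq_lock(env.dst_rq, busiest);
            /* busiest's clock is still fresh, but its lock was dropped
             * and re-taken above, so the rq_flags pin has to be
             * restored before detach_task() runs under this rf. */
            rq_repin_lock(env.src_rq, &rf);
            /* dst_rq has not been updated on this path yet and
             * attach_task() needs a fresh clock. */
            update_rq_clock(env.dst_rq);
            cur_ld_moved = ld_moved = move_task_groups(&env);
            double_rq_unlock(env.dst_rq, busiest);
            local_irq_restore(rf.flags);
    }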
https://jira.sw.ru/browse/PSBM-108013
Reported-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
https://jira.sw.ru/browse/PSBM-133986
See also:
5cb9eaa3d ("sched: Wrap rq::lock access")
36c5bdc43 ("sched/topology: Kill SD_LOAD_BALANCE")
e669ac8ab ("sched: Remove checks against SD_LOAD_BALANCE")
9818427c6 ("sched/debug: Make sd->flags sysctl read-only")
(cherry picked from commit fbafc1d55798fb54805164bb79a99aba859b294d)
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
---
include/linux/sched.h | 29 +++
include/linux/sched/sysctl.h | 5 +
include/linux/sched/topology.h | 5 +
init/Kconfig | 4 +
kernel/sched/core.c | 44 ++++
kernel/sched/fair.c | 396 +++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 16 ++
kernel/sysctl.c | 19 ++
8 files changed, 518 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 31e9e41b9d9d..c91d4777aedd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -451,6 +451,9 @@ struct sched_statistics {
u64 nr_migrations_cold;
u64 nr_failed_migrations_affine;
u64 nr_failed_migrations_running;
+#ifdef CONFIG_CFS_CPULIMIT
+ u64 nr_failed_migrations_cpulimit;
+#endif
u64 nr_failed_migrations_hot;
u64 nr_forced_migrations;
@@ -471,6 +474,9 @@ struct sched_entity {
struct load_weight load;
struct rb_node run_node;
struct list_head group_node;
+#ifdef CONFIG_CFS_CPULIMIT
+ struct list_head cfs_rq_node;
+#endif
unsigned int on_rq;
u64 exec_start;
@@ -2053,6 +2059,29 @@ static inline bool vcpu_is_preempted(int cpu)
}
#endif
+#ifdef CONFIG_CFS_CPULIMIT
+extern unsigned int task_nr_cpus(struct task_struct *p);
+extern unsigned int task_vcpu_id(struct task_struct *p);
+extern unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq);
+#else
+static inline unsigned int task_nr_cpus(struct task_struct *p)
+{
+ return num_online_cpus();
+}
+
+static inline unsigned int task_vcpu_id(struct task_struct *p)
+{
+ return task_cpu(p);
+}
+
+static inline unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+ return freq;
+}
+#endif
+
+#define num_online_vcpus() task_nr_cpus(current)
+
extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index db2c0f34aaaf..b6adb2b82e52 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -99,4 +99,9 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
#endif
+#ifdef CONFIG_CFS_CPULIMIT
+extern unsigned int sysctl_sched_vcpu_hotslice;
+extern unsigned int sysctl_sched_cpulimit_scale_cpufreq;
+#endif
+
#endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 8f0f778b7c91..379fd57f665e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -118,6 +118,11 @@ struct sched_domain {
unsigned int alb_failed;
unsigned int alb_pushed;
+ /* cpulimit balancing */
+ unsigned int clb_count;
+ unsigned int clb_failed;
+ unsigned int clb_pushed;
+
/* SD_BALANCE_EXEC stats */
unsigned int sbe_count;
unsigned int sbe_balanced;
diff --git a/init/Kconfig b/init/Kconfig
index 564553afb251..157a015393ac 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -966,9 +966,13 @@ config FAIR_GROUP_SCHED
depends on CGROUP_SCHED
default CGROUP_SCHED
+config CFS_CPULIMIT
+ bool
+
config CFS_BANDWIDTH
bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
depends on FAIR_GROUP_SCHED
+ select CFS_CPULIMIT
default n
help
This option allows users to define CPU bandwidth rates (limits) for
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ebb6dd99b442..d824282e942b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -371,6 +371,47 @@ static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
*/
int sysctl_sched_rt_runtime = 950000;
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int task_nr_cpus(struct task_struct *p)
+{
+ unsigned int nr_cpus = 0;
+ unsigned int max_nr_cpus = num_online_cpus();
+
+ rcu_read_lock();
+ nr_cpus = task_group(p)->nr_cpus;
+ rcu_read_unlock();
+
+ if (!nr_cpus || nr_cpus > max_nr_cpus)
+ nr_cpus = max_nr_cpus;
+
+ return nr_cpus;
+}
+
+unsigned int task_vcpu_id(struct task_struct *p)
+{
+ return task_cpu(p) % task_nr_cpus(p);
+}
+
+unsigned int sysctl_sched_cpulimit_scale_cpufreq = 1;
+
+unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+ unsigned long rate, max_rate;
+
+ if (!sysctl_sched_cpulimit_scale_cpufreq)
+ return freq;
+
+ rcu_read_lock();
+ rate = task_group(current)->cpu_rate;
+ rcu_read_unlock();
+
+ max_rate = num_online_vcpus() * MAX_CPU_RATE;
+ if (!rate || rate >= max_rate)
+ return freq;
+
+ return div_u64((u64)freq * rate, max_rate); /* avoid 32bit overflow */
+}
+#endif
/*
* Serialization rules:
@@ -9085,6 +9126,9 @@ void __init sched_init(void)
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
+#ifdef CONFIG_CFS_CPULIMIT
+ root_task_group.topmost_limited_ancestor = &root_task_group;
+#endif
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fb30663db2fe..c42ff00885c0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -134,6 +134,11 @@ int __weak arch_asym_cpu_priority(int cpu)
* (default: 5 msec, units: microseconds)
*/
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+
+#endif
+
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int sysctl_sched_vcpu_hotslice = 5000000UL;
#endif
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -470,6 +475,88 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_CFS_CPULIMIT
+static int cfs_rq_active(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->active;
+}
+
+static void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+ /* if we canceled delayed dec, there is no need to do inc */
+ if (hrtimer_try_to_cancel(&cfs_rq->active_timer) != 1)
+ atomic_inc(&cfs_rq->tg->nr_cpus_active);
+ cfs_rq->active = 1;
+}
+
+static void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+ if (!cfs_rq->runtime_enabled || !sysctl_sched_vcpu_hotslice)
+ postpone = 0;
+
+ if (!postpone) {
+ cfs_rq->active = 0;
+ atomic_dec(&cfs_rq->tg->nr_cpus_active);
+ } else {
+ hrtimer_start_range_ns(&cfs_rq->active_timer,
+ ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
+ HRTIMER_MODE_REL_PINNED);
+ }
+}
+
+static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+ struct cfs_rq *cfs_rq =
+ container_of(timer, struct cfs_rq, active_timer);
+ struct rq *rq = rq_of(cfs_rq);
+ unsigned long flags;
+
+ raw_spin_rq_lock_irqsave(rq, flags);
+ cfs_rq->active = !list_empty(&cfs_rq->tasks);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+
+ atomic_dec(&cfs_rq->tg->nr_cpus_active);
+
+ return HRTIMER_NORESTART;
+}
+
+static int check_cpulimit_spread(struct task_group *tg, int target_cpu)
+{
+ int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
+ int nr_cpus_limit = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);
+
+ nr_cpus_limit = nr_cpus_limit && tg->nr_cpus ?
+ min_t(int, nr_cpus_limit, tg->nr_cpus) :
+ max_t(int, nr_cpus_limit, tg->nr_cpus);
+
+ if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
+ return 1;
+
+ if (nr_cpus_active > nr_cpus_limit)
+ return -1;
+
+ return cfs_rq_active(tg->cfs_rq[target_cpu]) ? 0 : -1;
+}
+#else /* !CONFIG_CFS_CPULIMIT */
+static inline void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+}
+
+static inline enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+ return 0;
+}
+
+static inline int check_cpulimit_spread(struct task_group *tg, int target_cpu)
+{
+ return 1;
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
@@ -2960,6 +3047,9 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
account_numa_enqueue(rq, task_of(se));
list_add(&se->group_node, &rq->cfs_tasks);
+#ifdef CONFIG_CFS_CPULIMIT
+ list_add(&se->cfs_rq_node, &cfs_rq->tasks);
+#endif
}
#endif
cfs_rq->nr_running++;
@@ -2973,6 +3063,9 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
+#ifdef CONFIG_CFS_CPULIMIT
+ list_del(&se->cfs_rq_node);
+#endif
}
#endif
cfs_rq->nr_running--;
@@ -4251,6 +4344,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;
+ if (!cfs_rq->load.weight)
+ inc_nr_active_cfs_rqs(cfs_rq);
/*
* If we're the current task, we must renormalise before calling
* update_curr().
@@ -4408,6 +4503,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
+
+ if (!cfs_rq->load.weight)
+ dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
}
/*
@@ -5332,6 +5430,10 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_CFS_CPULIMIT
+ hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_rq->active_timer.function = sched_cfs_active_timer;
+#endif
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5727,6 +5829,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* Working cpumask for: load_balance, load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+#ifdef CONFIG_CFS_CPULIMIT
+static DEFINE_PER_CPU(struct callback_head, cpulimit_cb_head);
+#endif
#ifdef CONFIG_NO_HZ_COMMON
@@ -6844,6 +6949,38 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
return target;
}
+static bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
+{
+#ifdef CONFIG_CFS_CPULIMIT
+ struct task_group *tg;
+ struct sched_domain *sd;
+ int prev_cpu = task_cpu(p);
+ int cpu;
+
+ tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+ if (check_cpulimit_spread(tg, *new_cpu) > 0)
+ return false;
+
+ if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
+ return true;
+
+ if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
+ *new_cpu = prev_cpu;
+ return true;
+ }
+
+ for_each_domain(*new_cpu, sd) {
+ for_each_cpu_and(cpu, sched_domain_span(sd), p->cpus_ptr) {
+ if (cfs_rq_active(tg->cfs_rq[cpu])) {
+ *new_cpu = cpu;
+ return true;
+ }
+ }
+ }
+#endif
+ return false;
+}
+
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
@@ -6903,6 +7040,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
break;
}
+ if (select_runnable_cpu(p, &new_cpu))
+ goto unlock;
+
if (unlikely(sd)) {
/* Slow path */
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
@@ -6913,6 +7053,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
if (want_affine)
current->recent_used_cpu = cpu;
}
+unlock:
rcu_read_unlock();
return new_cpu;
@@ -7195,6 +7336,51 @@ static struct task_struct *pick_task_fair(struct rq *rq)
}
#endif
+#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
+static int cpulimit_balance_cpu_stop(void *data);
+
+static void trigger_cpulimit_balance(struct rq *this_rq)
+{
+ struct task_struct *p = this_rq->curr;
+ struct task_group *tg;
+ int this_cpu, cpu, target_cpu = -1;
+ struct sched_domain *sd;
+
+ this_cpu = cpu_of(this_rq);
+
+ if (!p->se.on_rq || this_rq->active_balance)
+ return;
+
+ tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+ if (check_cpulimit_spread(tg, this_cpu) >= 0)
+ return;
+
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ for_each_cpu_and(cpu, sched_domain_span(sd),
+ p->cpus_ptr) {
+ if (cpu != this_cpu &&
+ cfs_rq_active(tg->cfs_rq[cpu])) {
+ target_cpu = cpu;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ rcu_read_unlock();
+
+ if (target_cpu >= 0) {
+ this_rq->active_balance = 1;
+ this_rq->push_cpu = target_cpu;
+ raw_spin_rq_unlock(this_rq);
+ stop_one_cpu_nowait(this_rq->cpu,
+ cpulimit_balance_cpu_stop, this_rq,
+ &this_rq->active_balance_work);
+ raw_spin_rq_lock(this_rq);
+ }
+}
+#endif
+
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -7282,6 +7468,9 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
set_next_entity(cfs_rq, se);
}
+#ifdef CONFIG_CFS_CPULIMIT
+ queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu), trigger_cpulimit_balance);
+#endif
goto done;
simple:
#endif
@@ -7311,6 +7500,9 @@ done: __maybe_unused;
update_misfit_status(p, rq);
+#ifdef CONFIG_CFS_CPULIMIT
+ queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu), trigger_cpulimit_balance);
+#endif
return p;
idle:
@@ -7716,6 +7908,37 @@ static inline int migrate_degrades_locality(struct task_struct *p,
}
#endif
+static int can_migrate_task_cpulimit(struct task_struct *p, struct lb_env *env)
+{
+#ifdef CONFIG_CFS_CPULIMIT
+ struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+
+ if (check_cpulimit_spread(tg, env->dst_cpu) < 0) {
+ int cpu;
+
+ schedstat_inc(p->se.statistics.nr_failed_migrations_cpulimit);
+
+ env->flags |= LBF_SOME_PINNED;
+
+ if (check_cpulimit_spread(tg, env->src_cpu) != 0)
+ return 0;
+
+ if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+ return 0;
+
+ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+ if (cfs_rq_active(tg->cfs_rq[cpu])) {
+ env->flags |= LBF_DST_PINNED;
+ env->new_dst_cpu = cpu;
+ break;
+ }
+ }
+ return 0;
+ }
+#endif
+ return 1;
+}
+
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
@@ -7726,6 +7949,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
lockdep_assert_rq_held(env->src_rq);
+ if (!can_migrate_task_cpulimit(p, env))
+ return 0;
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
@@ -8087,6 +8312,161 @@ static inline void update_blocked_load_tick(struct rq *rq) {}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
+#ifdef CONFIG_CFS_CPULIMIT
+static unsigned long entity_h_load(struct sched_entity *se);
+
+static int can_migrate_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+ struct sched_entity *se;
+ struct task_struct *p;
+
+ list_for_each_entry(se, &cfs_rq->tasks, cfs_rq_node) {
+ p = task_of(se);
+ if (task_curr(p) ||
+ !cpumask_test_cpu(env->dst_cpu, p->cpus_ptr))
+ return 0;
+ }
+ env->flags &= ~LBF_ALL_PINNED;
+ return 1;
+}
+
+static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+ struct sched_entity *se, *tmp;
+ int moved = 0;
+
+ list_for_each_entry_safe(se, tmp, &cfs_rq->tasks, cfs_rq_node) {
+ struct task_struct *p = task_of(se);
+ detach_task(p, env);
+ attach_task(env->dst_rq, p);
+ moved++;
+ }
+ return moved;
+}
+
+static int move_task_groups(struct lb_env *env)
+{
+ struct cfs_rq *cfs_rq, *pos;
+ struct task_group *tg;
+ unsigned long load;
+ int cur_pulled, pulled = 0;
+
+ if (env->imbalance <= 0)
+ return 0;
+
+ for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
+ if (cfs_rq->tg == &root_task_group)
+ continue;
+ /*
+ * A child always goes before its parent in a leaf_cfs_rq_list.
+ * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
+ * we could not migrate the child and therefore we should not
+ * even try to migrate the parent.
+ */
+ if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+ continue;
+
+ tg = cfs_rq->tg->topmost_limited_ancestor;
+
+ if (check_cpulimit_spread(tg, env->src_cpu) != 0 ||
+ cfs_rq_active(tg->cfs_rq[env->dst_cpu]))
+ continue;
+
+ load = entity_h_load(tg->se[env->src_cpu]);
+ if ((load / 2) > env->imbalance)
+ continue;
+
+ if (!can_migrate_task_group(cfs_rq, env))
+ continue;
+
+ cur_pulled = move_task_group(cfs_rq, env);
+ pulled += cur_pulled;
+ env->imbalance -= load;
+
+ env->loop += cur_pulled;
+ if (env->loop > env->loop_max)
+ break;
+
+ if (env->imbalance <= 0)
+ break;
+ }
+ return pulled;
+}
+
+static int do_cpulimit_balance(struct lb_env *env)
+{
+ struct cfs_rq *cfs_rq, *pos;
+ struct task_group *tg;
+ int pushed = 0;
+
+ for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
+ if (cfs_rq->tg == &root_task_group)
+ continue;
+ /* see move_task_groups for why we skip such groups */
+ if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+ continue;
+ tg = cfs_rq->tg->topmost_limited_ancestor;
+ if (check_cpulimit_spread(tg, env->src_cpu) < 0 &&
+ cfs_rq_active(tg->cfs_rq[env->dst_cpu]) &&
+ can_migrate_task_group(cfs_rq, env))
+ pushed += move_task_group(cfs_rq, env);
+ }
+ return pushed;
+}
+
+static int cpulimit_balance_cpu_stop(void *data)
+{
+ struct rq *rq = data;
+ int cpu = cpu_of(rq);
+ int target_cpu = rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+
+ raw_spin_rq_lock_irq(rq);
+
+ if (unlikely(cpu != smp_processor_id() || !rq->active_balance ||
+ !cpu_online(target_cpu)))
+ goto out_unlock;
+
+ if (unlikely(!rq->nr_running))
+ goto out_unlock;
+
+ BUG_ON(rq == target_rq);
+
+ double_lock_balance(rq, target_rq);
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ break;
+ }
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = cpu,
+ .src_rq = rq,
+ };
+
+ schedstat_inc(sd->clb_count);
+
+ update_rq_clock(rq);
+ update_rq_clock(target_rq);
+ if (do_cpulimit_balance(&env))
+ schedstat_inc(sd->clb_pushed);
+ else
+ schedstat_inc(sd->clb_failed);
+ }
+ rcu_read_unlock();
+ double_unlock_balance(rq, target_rq);
+
+out_unlock:
+ rq->active_balance = 0;
+ raw_spin_rq_unlock_irq(rq);
+ return 0;
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
static bool __update_blocked_others(struct rq *rq, bool *done)
{
const struct sched_class *curr_class;
@@ -9812,6 +10192,19 @@ static int load_balance(int this_cpu, struct rq *this_rq,
local_irq_restore(rf.flags);
+#ifdef CONFIG_CFS_CPULIMIT
+ if (!ld_moved && (env.flags & LBF_ALL_PINNED)) {
+ env.loop = 0;
+ local_irq_save(rf.flags);
+ double_rq_lock(env.dst_rq, busiest);
+ rq_repin_lock(env.src_rq, &rf);
+ update_rq_clock(env.dst_rq);
+ cur_ld_moved = ld_moved = move_task_groups(&env);
+ double_rq_unlock(env.dst_rq, busiest);
+ local_irq_restore(rf.flags);
+ }
+#endif
+
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
@@ -11251,6 +11644,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
+#ifdef CONFIG_CFS_CPULIMIT
+ INIT_LIST_HEAD(&cfs_rq->tasks);
+#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ed6e12e3eb65..9cddbc9920f8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -433,6 +433,14 @@ struct task_group {
struct uclamp_se uclamp[UCLAMP_CNT];
#endif
+#ifdef CONFIG_CFS_CPULIMIT
+#define MAX_CPU_RATE 1024
+ unsigned long cpu_rate;
+ unsigned int nr_cpus;
+ atomic_t nr_cpus_active;
+ struct task_group *topmost_limited_ancestor; /* self if none of the
+ ancestors is limited */
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -540,6 +548,9 @@ struct cfs_rq {
#endif
struct rb_root_cached tasks_timeline;
+#ifdef CONFIG_CFS_CPULIMIT
+ struct list_head tasks;
+#endif
/*
* 'curr' points to currently running entity on this cfs_rq.
@@ -613,6 +624,10 @@ struct cfs_rq {
int throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_CFS_CPULIMIT
+ int active;
+ struct hrtimer active_timer;
+#endif /* CONFIG_CFS_CPULIMIT */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -2087,6 +2102,7 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_TASK_SLEEP 0x10
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3a5a6455b363..9a3f06137568 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1865,6 +1865,25 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
+#ifdef CONFIG_CFS_CPULIMIT
+ {
+ .procname = "sched_vcpu_hotslice",
+ .data = &sysctl_sched_vcpu_hotslice,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "sched_cpulimit_scale_cpufreq",
+ .data = &sysctl_sched_cpulimit_scale_cpufreq,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
--
2.31.1