[Devel] [PATCH RHEL COMMIT] sched: Port CONFIG_CFS_CPULIMIT feature

Konstantin Khorenko khorenko at virtuozzo.com
Fri Sep 24 14:49:32 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit cce10f3b8b9d37ded29f40672502c213de4c22f5
Author: Kirill Tkhai <ktkhai at virtuozzo.com>
Date:   Fri Sep 24 14:49:32 2021 +0300

    sched: Port CONFIG_CFS_CPULIMIT feature
    
    Add the possibility to limit the CPUs used by a cgroup/container.
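
    To make the fair.c hunk below easier to follow, here is a minimal
    standalone sketch (plain userspace C with made-up values, not the
    kernel code itself) of the check_cpulimit_spread() decision that
    drives the packing: 1 means the group may spread to another CPU,
    0 means it must stay on the CPUs already active for it, and -1
    means it is over its limit and should be pulled back.

    #include <stdio.h>

    #define MAX_CPU_RATE 1024   /* 100% of one CPU, as in kernel/sched/sched.h */

    /* Mirrors check_cpulimit_spread(); all inputs here are made up. */
    static int check_spread(int nr_cpus_active, int target_cpu_active,
                            unsigned long cpu_rate, int nr_cpus)
    {
            /* DIV_ROUND_UP(cpu_rate, MAX_CPU_RATE) */
            int limit = (cpu_rate + MAX_CPU_RATE - 1) / MAX_CPU_RATE;

            /* both limits set: take the smaller; otherwise whichever is set */
            limit = (limit && nr_cpus) ?
                    (limit < nr_cpus ? limit : nr_cpus) :
                    (limit > nr_cpus ? limit : nr_cpus);

            if (!limit || nr_cpus_active < limit)
                    return 1;               /* under the limit: may spread  */
            if (nr_cpus_active > limit)
                    return -1;              /* over the limit: shrink back  */
            return target_cpu_active ? 0 : -1; /* exactly at the limit      */
    }

    int main(void)
    {
            /* group limited to 2 CPUs (tg->nr_cpus == 2), no rate limit */
            printf("%d\n", check_spread(1, 0, 0, 2));   /* prints  1 */
            printf("%d\n", check_spread(2, 1, 0, 2));   /* prints  0 */
            printf("%d\n", check_spread(3, 1, 0, 2));   /* prints -1 */
            return 0;
    }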
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    +++
    sched: Allow configuring sched_vcpu_hotslice and sched_cpulimit_scale_cpufreq
    
    Let's make the sysctls ported from vz8 really configurable.
    
    These are lost hunks from vz7 commits:
    f06fef25c0859 ("sched: Add cpulimit base interfaces")
    4805ea1432210 ("ve/sched: port vcpu hotslice")
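
    For reference: sched_vcpu_hotslice is in nanoseconds (the default of
    5000000 set in fair.c below is 5 ms) and postpones marking a group's
    cfs_rq inactive when its last task goes to sleep, while
    sched_cpulimit_scale_cpufreq is a 0/1 toggle for scaling the CPU
    frequency reported for a rate-limited group. A standalone sketch of
    that scaling (userspace C with hypothetical numbers, mirroring
    sched_cpulimit_scale_cpufreq() in the core.c hunk below):

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_CPU_RATE 1024   /* 100% of one CPU */

    /* Scale freq by the group's cpu_rate relative to its (v)CPU capacity. */
    static unsigned int scale_cpufreq(unsigned int freq, unsigned long rate,
                                      unsigned int nr_vcpus)
    {
            uint64_t max_rate = (uint64_t)nr_vcpus * MAX_CPU_RATE;

            if (!rate || rate >= max_rate)
                    return freq;    /* no limit set, or limit >= capacity */

            return (uint64_t)freq * rate / max_rate;  /* 64-bit: no overflow */
    }

    int main(void)
    {
            /* 2-vCPU group limited to 150% of a CPU, host CPU at 3 GHz */
            printf("%u kHz\n", scale_cpufreq(3000000, 3 * MAX_CPU_RATE / 2, 2));
            /* prints "2250000 kHz" */
            return 0;
    }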
    
    https://jira.sw.ru/browse/PSBM-127780
    Fixes: ddbb18ac80519 ("sched: Port CONFIG_CFS_CPULIMIT feature")
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    +++
    kernel/sched/fair.c: Add missing update_rq_clock() calls
    
    We've got a hard lockup which seems to be caused by the mgag200
    console printk code calling schedule_work() from the scheduler
    with rq->lock held:
    
      #5 [ffffb79e034239a8] native_queued_spin_lock_slowpath at ffffffff8b50c6c6
      #6 [ffffb79e034239a8] _raw_spin_lock at ffffffff8bc96e5c
      #7 [ffffb79e034239b0] try_to_wake_up at ffffffff8b4e26ff
      #8 [ffffb79e03423a10] __queue_work at ffffffff8b4ce3f3
      #9 [ffffb79e03423a58] queue_work_on at ffffffff8b4ce714
     #10 [ffffb79e03423a68] mga_imageblit at ffffffffc026d666 [mgag200]
     #11 [ffffb79e03423a80] soft_cursor at ffffffff8b8a9d84
     #12 [ffffb79e03423ad8] bit_cursor at ffffffff8b8a99b2
     #13 [ffffb79e03423ba0] hide_cursor at ffffffff8b93bc7a
     #14 [ffffb79e03423bb0] vt_console_print at ffffffff8b93e07d
     #15 [ffffb79e03423c18] console_unlock at ffffffff8b518f0e
     #16 [ffffb79e03423c68] vprintk_emit_log at ffffffff8b51acf7
     #17 [ffffb79e03423cc0] vprintk_default at ffffffff8b51adcd
     #18 [ffffb79e03423cd0] printk at ffffffff8b51b3d6
     #19 [ffffb79e03423d30] __warn_printk at ffffffff8b4b13a0
     #20 [ffffb79e03423d98] assert_clock_updated at ffffffff8b4dd293
     #21 [ffffb79e03423da0] deactivate_task at ffffffff8b4e12d1
     #22 [ffffb79e03423dc8] move_task_group at ffffffff8b4eaa5b
     #23 [ffffb79e03423e00] cpulimit_balance_cpu_stop at ffffffff8b4f02f3
     #24 [ffffb79e03423eb0] cpu_stopper_thread at ffffffff8b576b67
     #25 [ffffb79e03423ee8] smpboot_thread_fn at ffffffff8b4d9125
     #26 [ffffb79e03423f10] kthread at ffffffff8b4d4fc2
     #27 [ffffb79e03423f50] ret_from_fork at ffffffff8be00255
    
    The printk was called because assert_clock_updated() triggered
            SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
    
    This means that we are missing a necessary update_rq_clock() call.
    Add one to cpulimit_balance_cpu_stop() to fix the warning.
    Also add one in load_balance() before the move_task_groups() call,
    which seems to be another place missing this call.
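
    To illustrate why the warning fires, here is a toy userspace model
    (hypothetical, not the kernel code): roughly, the "clock updated" bit
    is dropped whenever the rq lock is re-taken, so any path that goes on
    to deactivate or attach tasks must call update_rq_clock() first,
    otherwise assert_clock_updated() sees a stale clock:

    #include <stdio.h>

    /* same flag values as kernel/sched/sched.h */
    #define RQCF_REQ_SKIP   0x01
    #define RQCF_ACT_SKIP   0x02
    #define RQCF_UPDATED    0x04

    struct rq { unsigned int clock_update_flags; };

    static void rq_lock(struct rq *rq)
    {
            /* simplified: the UPDATED bit is dropped when the lock is taken */
            rq->clock_update_flags &= (RQCF_REQ_SKIP | RQCF_ACT_SKIP);
    }

    static void update_rq_clock(struct rq *rq)
    {
            rq->clock_update_flags |= RQCF_UPDATED;   /* clock is fresh now */
    }

    static void deactivate_task(struct rq *rq)
    {
            /* models assert_clock_updated()'s SCHED_WARN_ON() */
            if (rq->clock_update_flags < RQCF_ACT_SKIP)
                    printf("WARNING: rq clock stale in deactivate_task()\n");
    }

    int main(void)
    {
            struct rq rq = { 0 };

            rq_lock(&rq);
            deactivate_task(&rq);      /* the bug: warning fires          */

            rq_lock(&rq);
            update_rq_clock(&rq);
            deactivate_task(&rq);      /* with the fix: clock updated, OK */
            return 0;
    }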
    
    https://jira.sw.ru/browse/PSBM-108013
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    
    +++
    kernel/sched/fair.c: Add more missing update_rq_clock() calls
    
    Add update_rq_clock() for 'target_rq' to avoid the WARN() coming
    from attach_task(). Also add rq_repin_lock(busiest, &rf); in
    load_balance() for detach_task(). Another update_rq_clock() isn't
    necessary there, since the clock was already updated, but we do need
    the repin because the rq lock was released after that update.
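
    Roughly, the pin bookkeeping behaves like this toy model (hypothetical,
    not the kernel code): detach_task() must run on a locked *and* pinned
    src rq; dropping the lock loses the pin, so after re-taking the lock it
    has to be re-pinned before detaching:

    #include <stdio.h>
    #include <stdbool.h>

    struct rq { bool locked, pinned; };

    static void rq_lock(struct rq *rq)   { rq->locked = true;  }
    static void rq_unlock(struct rq *rq) { rq->locked = false; rq->pinned = false; }
    static void rq_pin(struct rq *rq)    { rq->pinned = true;  }
    static void rq_repin(struct rq *rq)  { rq->pinned = true;  }

    static void detach_task(struct rq *src)
    {
            /* models the lockdep/clock assertions hit via detach_task() */
            if (!src->locked || !src->pinned)
                    printf("WARNING: detach_task() on an unpinned rq lock\n");
    }

    int main(void)
    {
            struct rq busiest = { 0 };

            rq_lock(&busiest);
            rq_pin(&busiest);
            /* ... first detach/attach pass ... */
            rq_unlock(&busiest);       /* lock dropped: pin is gone           */

            rq_lock(&busiest);         /* re-taken for move_task_groups()     */
            detach_task(&busiest);     /* bug: warning, lock not repinned     */

            rq_repin(&busiest);        /* the fix: rq_repin_lock(busiest,&rf) */
            detach_task(&busiest);     /* OK                                  */
            return 0;
    }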
    
    https://jira.sw.ru/browse/PSBM-108013
    
    Reported-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    
    Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    https://jira.sw.ru/browse/PSBM-133986
    
    See also:
    5cb9eaa3d ("sched: Wrap rq::lock access")
    36c5bdc43 ("sched/topology: Kill SD_LOAD_BALANCE")
    e669ac8ab ("sched: Remove checks against SD_LOAD_BALANCE")
    9818427c6 ("sched/debug: Make sd->flags sysctl read-only")
    
    (cherry picked from commit fbafc1d55798fb54805164bb79a99aba859b294d)
    Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
---
 include/linux/sched.h          |  29 +++
 include/linux/sched/sysctl.h   |   5 +
 include/linux/sched/topology.h |   5 +
 init/Kconfig                   |   4 +
 kernel/sched/core.c            |  44 +++++
 kernel/sched/fair.c            | 396 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h           |  16 ++
 kernel/sysctl.c                |  19 ++
 8 files changed, 518 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 31e9e41b9d9d..c91d4777aedd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -451,6 +451,9 @@ struct sched_statistics {
 	u64				nr_migrations_cold;
 	u64				nr_failed_migrations_affine;
 	u64				nr_failed_migrations_running;
+#ifdef CONFIG_CFS_CPULIMIT
+	u64				nr_failed_migrations_cpulimit;
+#endif
 	u64				nr_failed_migrations_hot;
 	u64				nr_forced_migrations;
 
@@ -471,6 +474,9 @@ struct sched_entity {
 	struct load_weight		load;
 	struct rb_node			run_node;
 	struct list_head		group_node;
+#ifdef CONFIG_CFS_CPULIMIT
+	struct list_head		cfs_rq_node;
+#endif
 	unsigned int			on_rq;
 
 	u64				exec_start;
@@ -2053,6 +2059,29 @@ static inline bool vcpu_is_preempted(int cpu)
 }
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+extern unsigned int task_nr_cpus(struct task_struct *p);
+extern unsigned int task_vcpu_id(struct task_struct *p);
+extern unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq);
+#else
+static inline unsigned int task_nr_cpus(struct task_struct *p)
+{
+	return num_online_cpus();
+}
+
+static inline unsigned int task_vcpu_id(struct task_struct *p)
+{
+	return task_cpu(p);
+}
+
+static inline unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+	return freq;
+}
+#endif
+
+#define num_online_vcpus() task_nr_cpus(current)
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index db2c0f34aaaf..b6adb2b82e52 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -99,4 +99,9 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+extern unsigned int sysctl_sched_vcpu_hotslice;
+extern unsigned int sysctl_sched_cpulimit_scale_cpufreq;
+#endif
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 8f0f778b7c91..379fd57f665e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -118,6 +118,11 @@ struct sched_domain {
 	unsigned int alb_failed;
 	unsigned int alb_pushed;
 
+	/* cpulimit balancing */
+	unsigned int clb_count;
+	unsigned int clb_failed;
+	unsigned int clb_pushed;
+
 	/* SD_BALANCE_EXEC stats */
 	unsigned int sbe_count;
 	unsigned int sbe_balanced;
diff --git a/init/Kconfig b/init/Kconfig
index 564553afb251..157a015393ac 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -966,9 +966,13 @@ config FAIR_GROUP_SCHED
 	depends on CGROUP_SCHED
 	default CGROUP_SCHED
 
+config CFS_CPULIMIT
+	bool
+
 config CFS_BANDWIDTH
 	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
 	depends on FAIR_GROUP_SCHED
+	select CFS_CPULIMIT
 	default n
 	help
 	  This option allows users to define CPU bandwidth rates (limits) for
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ebb6dd99b442..d824282e942b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -371,6 +371,47 @@ static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
  */
 int sysctl_sched_rt_runtime = 950000;
 
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int task_nr_cpus(struct task_struct *p)
+{
+	unsigned int nr_cpus = 0;
+	unsigned int max_nr_cpus = num_online_cpus();
+
+	rcu_read_lock();
+	nr_cpus = task_group(p)->nr_cpus;
+	rcu_read_unlock();
+
+	if (!nr_cpus || nr_cpus > max_nr_cpus)
+		nr_cpus = max_nr_cpus;
+
+	return nr_cpus;
+}
+
+unsigned int task_vcpu_id(struct task_struct *p)
+{
+	return task_cpu(p) % task_nr_cpus(p);
+}
+
+unsigned int sysctl_sched_cpulimit_scale_cpufreq = 1;
+
+unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+	unsigned long rate, max_rate;
+
+	if (!sysctl_sched_cpulimit_scale_cpufreq)
+		return freq;
+
+	rcu_read_lock();
+	rate = task_group(current)->cpu_rate;
+	rcu_read_unlock();
+
+	max_rate = num_online_vcpus() * MAX_CPU_RATE;
+	if (!rate || rate >= max_rate)
+		return freq;
+
+	return div_u64((u64)freq * rate, max_rate); /* avoid 32bit overflow */
+}
+#endif
 
 /*
  * Serialization rules:
@@ -9085,6 +9126,9 @@ void __init sched_init(void)
 	INIT_LIST_HEAD(&root_task_group.children);
 	INIT_LIST_HEAD(&root_task_group.siblings);
 	autogroup_init(&init_task);
+#ifdef CONFIG_CFS_CPULIMIT
+	root_task_group.topmost_limited_ancestor = &root_task_group;
+#endif
 #endif /* CONFIG_CGROUP_SCHED */
 
 	for_each_possible_cpu(i) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fb30663db2fe..c42ff00885c0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -134,6 +134,11 @@ int __weak arch_asym_cpu_priority(int cpu)
  * (default: 5 msec, units: microseconds)
  */
 unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
+
+#endif
+
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int sysctl_sched_vcpu_hotslice = 5000000UL;
 #endif
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -470,6 +475,88 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_CFS_CPULIMIT
+static int cfs_rq_active(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->active;
+}
+
+static void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+	/* if we canceled delayed dec, there is no need to do inc */
+	if (hrtimer_try_to_cancel(&cfs_rq->active_timer) != 1)
+		atomic_inc(&cfs_rq->tg->nr_cpus_active);
+	cfs_rq->active = 1;
+}
+
+static void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+	if (!cfs_rq->runtime_enabled || !sysctl_sched_vcpu_hotslice)
+		postpone = 0;
+
+	if (!postpone) {
+		cfs_rq->active = 0;
+		atomic_dec(&cfs_rq->tg->nr_cpus_active);
+	} else {
+		hrtimer_start_range_ns(&cfs_rq->active_timer,
+				ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
+				HRTIMER_MODE_REL_PINNED);
+	}
+}
+
+static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+	struct cfs_rq *cfs_rq =
+		container_of(timer, struct cfs_rq, active_timer);
+	struct rq *rq = rq_of(cfs_rq);
+	unsigned long flags;
+
+	raw_spin_rq_lock_irqsave(rq, flags);
+	cfs_rq->active = !list_empty(&cfs_rq->tasks);
+	raw_spin_rq_unlock_irqrestore(rq, flags);
+
+	atomic_dec(&cfs_rq->tg->nr_cpus_active);
+
+	return HRTIMER_NORESTART;
+}
+
+static int check_cpulimit_spread(struct task_group *tg, int target_cpu)
+{
+	int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
+	int nr_cpus_limit = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);
+
+	nr_cpus_limit = nr_cpus_limit && tg->nr_cpus ?
+		min_t(int, nr_cpus_limit, tg->nr_cpus) :
+		max_t(int, nr_cpus_limit, tg->nr_cpus);
+
+	if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
+		return 1;
+
+	if (nr_cpus_active > nr_cpus_limit)
+		return -1;
+
+	return cfs_rq_active(tg->cfs_rq[target_cpu]) ? 0 : -1;
+}
+#else /* !CONFIG_CFS_CPULIMIT */
+static inline void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+}
+
+static inline enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+	return 0;
+}
+
+static inline int check_cpulimit_spread(struct task_group *tg, int target_cpu)
+{
+	return 1;
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
 static __always_inline
 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
@@ -2960,6 +3047,9 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 		account_numa_enqueue(rq, task_of(se));
 		list_add(&se->group_node, &rq->cfs_tasks);
+#ifdef CONFIG_CFS_CPULIMIT
+		list_add(&se->cfs_rq_node, &cfs_rq->tasks);
+#endif
 	}
 #endif
 	cfs_rq->nr_running++;
@@ -2973,6 +3063,9 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	if (entity_is_task(se)) {
 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 		list_del_init(&se->group_node);
+#ifdef CONFIG_CFS_CPULIMIT
+		list_del(&se->cfs_rq_node);
+#endif
 	}
 #endif
 	cfs_rq->nr_running--;
@@ -4251,6 +4344,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
 	bool curr = cfs_rq->curr == se;
 
+	if (!cfs_rq->load.weight)
+		inc_nr_active_cfs_rqs(cfs_rq);
 	/*
 	 * If we're the current task, we must renormalise before calling
 	 * update_curr().
@@ -4408,6 +4503,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
 		update_min_vruntime(cfs_rq);
+
+	if (!cfs_rq->load.weight)
+		dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
 }
 
 /*
@@ -5332,6 +5430,10 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_CFS_CPULIMIT
+	hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_rq->active_timer.function = sched_cfs_active_timer;
+#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5727,6 +5829,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 /* Working cpumask for: load_balance, load_balance_newidle. */
 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+#ifdef CONFIG_CFS_CPULIMIT
+static DEFINE_PER_CPU(struct callback_head, cpulimit_cb_head);
+#endif
 
 #ifdef CONFIG_NO_HZ_COMMON
 
@@ -6844,6 +6949,38 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	return target;
 }
 
+static bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
+{
+#ifdef CONFIG_CFS_CPULIMIT
+	struct task_group *tg;
+	struct sched_domain *sd;
+	int prev_cpu = task_cpu(p);
+	int cpu;
+
+	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+	if (check_cpulimit_spread(tg, *new_cpu) > 0)
+		return false;
+
+	if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
+		return true;
+
+	if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
+		*new_cpu = prev_cpu;
+		return true;
+	}
+
+	for_each_domain(*new_cpu, sd) {
+		for_each_cpu_and(cpu, sched_domain_span(sd), p->cpus_ptr) {
+			if (cfs_rq_active(tg->cfs_rq[cpu])) {
+				*new_cpu = cpu;
+				return true;
+			}
+		}
+	}
+#endif
+	return false;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
@@ -6903,6 +7040,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 			break;
 	}
 
+	if (select_runnable_cpu(p, &new_cpu))
+		goto unlock;
+
 	if (unlikely(sd)) {
 		/* Slow path */
 		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
@@ -6913,6 +7053,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		if (want_affine)
 			current->recent_used_cpu = cpu;
 	}
+unlock:
 	rcu_read_unlock();
 
 	return new_cpu;
@@ -7195,6 +7336,51 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 }
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
+static int cpulimit_balance_cpu_stop(void *data);
+
+static void trigger_cpulimit_balance(struct rq *this_rq)
+{
+	struct task_struct *p = this_rq->curr;
+	struct task_group *tg;
+	int this_cpu, cpu, target_cpu = -1;
+	struct sched_domain *sd;
+
+	this_cpu = cpu_of(this_rq);
+
+	if (!p->se.on_rq || this_rq->active_balance)
+		return;
+
+	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+	if (check_cpulimit_spread(tg, this_cpu) >= 0)
+		return;
+
+	rcu_read_lock();
+	for_each_domain(this_cpu, sd) {
+		for_each_cpu_and(cpu, sched_domain_span(sd),
+				 p->cpus_ptr) {
+			if (cpu != this_cpu &&
+			    cfs_rq_active(tg->cfs_rq[cpu])) {
+				target_cpu = cpu;
+				goto unlock;
+			}
+		}
+	}
+unlock:
+	rcu_read_unlock();
+
+	if (target_cpu >= 0) {
+		this_rq->active_balance = 1;
+		this_rq->push_cpu = target_cpu;
+		raw_spin_rq_unlock(this_rq);
+		stop_one_cpu_nowait(this_rq->cpu,
+				    cpulimit_balance_cpu_stop, this_rq,
+				    &this_rq->active_balance_work);
+		raw_spin_rq_lock(this_rq);
+	}
+}
+#endif
+
 struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -7282,6 +7468,9 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 		set_next_entity(cfs_rq, se);
 	}
 
+#ifdef CONFIG_CFS_CPULIMIT
+	queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu), trigger_cpulimit_balance);
+#endif
 	goto done;
 simple:
 #endif
@@ -7311,6 +7500,9 @@ done: __maybe_unused;
 
 	update_misfit_status(p, rq);
 
+#ifdef CONFIG_CFS_CPULIMIT
+	queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu), trigger_cpulimit_balance);
+#endif
 	return p;
 
 idle:
@@ -7716,6 +7908,37 @@ static inline int migrate_degrades_locality(struct task_struct *p,
 }
 #endif
 
+static int can_migrate_task_cpulimit(struct task_struct *p, struct lb_env *env)
+{
+#ifdef CONFIG_CFS_CPULIMIT
+	struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+
+	if (check_cpulimit_spread(tg, env->dst_cpu) < 0) {
+		int cpu;
+
+		schedstat_inc(p->se.statistics.nr_failed_migrations_cpulimit);
+
+		env->flags |= LBF_SOME_PINNED;
+
+		if (check_cpulimit_spread(tg, env->src_cpu) != 0)
+			return 0;
+
+		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+			return 0;
+
+		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+			if (cfs_rq_active(tg->cfs_rq[cpu])) {
+				env->flags |= LBF_DST_PINNED;
+				env->new_dst_cpu = cpu;
+				break;
+			}
+		}
+		return 0;
+	}
+#endif
+	return 1;
+}
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -7726,6 +7949,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
 	lockdep_assert_rq_held(env->src_rq);
 
+        if (!can_migrate_task_cpulimit(p, env))
+                return 0;
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
@@ -8087,6 +8312,161 @@ static inline void update_blocked_load_tick(struct rq *rq) {}
 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+static unsigned long entity_h_load(struct sched_entity *se);
+
+static int can_migrate_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+	struct sched_entity *se;
+	struct task_struct *p;
+
+	list_for_each_entry(se, &cfs_rq->tasks, cfs_rq_node) {
+		p = task_of(se);
+		if (task_curr(p) ||
+		    !cpumask_test_cpu(env->dst_cpu, p->cpus_ptr))
+			return 0;
+	}
+	env->flags &= ~LBF_ALL_PINNED;
+	return 1;
+}
+
+static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+	struct sched_entity *se, *tmp;
+	int moved = 0;
+
+	list_for_each_entry_safe(se, tmp, &cfs_rq->tasks, cfs_rq_node) {
+		struct task_struct *p = task_of(se);
+		detach_task(p, env);
+		attach_task(env->dst_rq, p);
+		moved++;
+	}
+	return moved;
+}
+
+static int move_task_groups(struct lb_env *env)
+{
+	struct cfs_rq *cfs_rq, *pos;
+	struct task_group *tg;
+	unsigned long load;
+	int cur_pulled, pulled = 0;
+
+	if (env->imbalance <= 0)
+		return 0;
+
+	for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
+		if (cfs_rq->tg == &root_task_group)
+			continue;
+		/*
+		 * A child always goes before its parent in a leaf_cfs_rq_list.
+		 * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
+		 * we could not migrate the child and therefore we should not
+		 * even try to migrate the parent.
+		 */
+		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+			continue;
+
+		tg = cfs_rq->tg->topmost_limited_ancestor;
+
+		if (check_cpulimit_spread(tg, env->src_cpu) != 0 ||
+		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]))
+			continue;
+
+		load = entity_h_load(tg->se[env->src_cpu]);
+		if ((load / 2) > env->imbalance)
+			continue;
+
+		if (!can_migrate_task_group(cfs_rq, env))
+			continue;
+
+		cur_pulled = move_task_group(cfs_rq, env);
+		pulled += cur_pulled;
+		env->imbalance -= load;
+
+		env->loop += cur_pulled;
+		if (env->loop > env->loop_max)
+			break;
+
+		if (env->imbalance <= 0)
+			break;
+	}
+	return pulled;
+}
+
+static int do_cpulimit_balance(struct lb_env *env)
+{
+	struct cfs_rq *cfs_rq, *pos;
+	struct task_group *tg;
+	int pushed = 0;
+
+	for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
+		if (cfs_rq->tg == &root_task_group)
+			continue;
+		/* see move_task_groups for why we skip such groups */
+		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+			continue;
+		tg = cfs_rq->tg->topmost_limited_ancestor;
+		if (check_cpulimit_spread(tg, env->src_cpu) < 0 &&
+		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]) &&
+		    can_migrate_task_group(cfs_rq, env))
+			pushed += move_task_group(cfs_rq, env);
+	}
+	return pushed;
+}
+
+static int cpulimit_balance_cpu_stop(void *data)
+{
+	struct rq *rq = data;
+	int cpu = cpu_of(rq);
+	int target_cpu = rq->push_cpu;
+	struct rq *target_rq = cpu_rq(target_cpu);
+	struct sched_domain *sd;
+
+	raw_spin_rq_lock_irq(rq);
+
+	if (unlikely(cpu != smp_processor_id() || !rq->active_balance ||
+		     !cpu_online(target_cpu)))
+		goto out_unlock;
+
+	if (unlikely(!rq->nr_running))
+		goto out_unlock;
+
+	BUG_ON(rq == target_rq);
+
+	double_lock_balance(rq, target_rq);
+	rcu_read_lock();
+	for_each_domain(target_cpu, sd) {
+		if (cpumask_test_cpu(cpu, sched_domain_span(sd)))
+				break;
+	}
+	if (likely(sd)) {
+		struct lb_env env = {
+			.sd		= sd,
+			.dst_cpu	= target_cpu,
+			.dst_rq		= target_rq,
+			.src_cpu	= cpu,
+			.src_rq		= rq,
+		};
+
+		schedstat_inc(sd->clb_count);
+
+		update_rq_clock(rq);
+		update_rq_clock(target_rq);
+		if (do_cpulimit_balance(&env))
+			schedstat_inc(sd->clb_pushed);
+		else
+			schedstat_inc(sd->clb_failed);
+	}
+	rcu_read_unlock();
+	double_unlock_balance(rq, target_rq);
+
+out_unlock:
+	rq->active_balance = 0;
+	raw_spin_rq_unlock_irq(rq);
+	return 0;
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
 static bool __update_blocked_others(struct rq *rq, bool *done)
 {
 	const struct sched_class *curr_class;
@@ -9812,6 +10192,19 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 		local_irq_restore(rf.flags);
 
+#ifdef CONFIG_CFS_CPULIMIT
+		if (!ld_moved && (env.flags & LBF_ALL_PINNED)) {
+			env.loop = 0;
+			local_irq_save(rf.flags);
+			double_rq_lock(env.dst_rq, busiest);
+			rq_repin_lock(env.src_rq, &rf);
+			update_rq_clock(env.dst_rq);
+			cur_ld_moved = ld_moved = move_task_groups(&env);
+			double_rq_unlock(env.dst_rq, busiest);
+			local_irq_restore(rf.flags);
+                }
+#endif
+
 		if (env.flags & LBF_NEED_BREAK) {
 			env.flags &= ~LBF_NEED_BREAK;
 			goto more_balance;
@@ -11251,6 +11644,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
+#ifdef CONFIG_CFS_CPULIMIT
+	INIT_LIST_HEAD(&cfs_rq->tasks);
+#endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ed6e12e3eb65..9cddbc9920f8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -433,6 +433,14 @@ struct task_group {
 	struct uclamp_se	uclamp[UCLAMP_CNT];
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+#define MAX_CPU_RATE 1024
+	unsigned long cpu_rate;
+	unsigned int nr_cpus;
+	atomic_t nr_cpus_active;
+	struct task_group *topmost_limited_ancestor; /* self if none of the
+							ancestors is limited */
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -540,6 +548,9 @@ struct cfs_rq {
 #endif
 
 	struct rb_root_cached	tasks_timeline;
+#ifdef CONFIG_CFS_CPULIMIT
+	struct list_head tasks;
+#endif
 
 	/*
 	 * 'curr' points to currently running entity on this cfs_rq.
@@ -613,6 +624,10 @@ struct cfs_rq {
 	int			throttle_count;
 	struct list_head	throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_CFS_CPULIMIT
+	int active;
+	struct hrtimer active_timer;
+#endif /* CONFIG_CFS_CPULIMIT */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
 
@@ -2087,6 +2102,7 @@ extern const u32		sched_prio_to_wmult[40];
 #define DEQUEUE_SAVE		0x02 /* Matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE		0x04 /* Matches ENQUEUE_MOVE */
 #define DEQUEUE_NOCLOCK		0x08 /* Matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_TASK_SLEEP	0x10
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 21d00e6954dd..5824d5dd2e1d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1866,6 +1866,25 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif
+#ifdef CONFIG_CFS_CPULIMIT
+	{
+		.procname	= "sched_vcpu_hotslice",
+		.data		= &sysctl_sched_vcpu_hotslice,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
+	{
+		.procname	= "sched_cpulimit_scale_cpufreq",
+		.data		= &sysctl_sched_cpulimit_scale_cpufreq,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",

