[Devel] [PATCH vz10 v2 1/2] sched: Clean up vCPU handling logic

Konstantin Khorenko khorenko at virtuozzo.com
Wed Mar 18 17:07:36 MSK 2026


Should kernel/sched/sched.h lines 778-781

   #ifdef CONFIG_CFS_CPULIMIT
       int active;
       struct hrtimer active_timer;
   #endif /* CONFIG_CFS_CPULIMIT */

be dropped as well?



On 3/17/26 09:33, Dmitry Sepp wrote:
> The idea behind the change is to transition from the existing spatial
> vCPU handling approach -- which introduces costly modifications to the
> scheduling logic to ensure the requested CPU count is obeyed (10%+
> performance drop in some tests) -- to temporal isolation, which can be
> provided by the cgroup2 cpu.max.
> 
> Drop the legacy unneeded vCPU handling code. Remove the 'cpu.rate'
> control in favor of the internal calculation based on 'quota' and
> 'period' from 'cpu.max'. As 'cpu.max' is not implicitly used to set the
> rate, do not override nr_cpus when handling writes to 'cpu.max'.
> 
> https://virtuozzo.atlassian.net/browse/VSTOR-124385
> 
> Signed-off-by: Dmitry Sepp <dmitry.sepp at virtuozzo.com>
> ---
>   include/linux/sched.h          |   6 -
>   include/linux/sched/topology.h |   5 -
>   kernel/sched/core.c            |  89 +------
>   kernel/sched/fair.c            | 408 ---------------------------------
>   kernel/sched/sched.h           |   6 -
>   5 files changed, 3 insertions(+), 511 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0f7892c449d2..493073a97f02 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -527,9 +527,6 @@ struct sched_statistics {
>   	u64				nr_migrations_cold;
>   	u64				nr_failed_migrations_affine;
>   	u64				nr_failed_migrations_running;
> -#ifdef CONFIG_CFS_CPULIMIT
> -	u64				nr_failed_migrations_cpulimit;
> -#endif
>   	u64				nr_failed_migrations_hot;
>   	u64				nr_forced_migrations;
>   
> @@ -558,9 +555,6 @@ struct sched_entity {
>   	u64				min_slice;
>   
>   	struct list_head		group_node;
> -#ifdef CONFIG_CFS_CPULIMIT
> -	struct list_head		cfs_rq_node;
> -#endif
>   	unsigned char			on_rq;
>   	unsigned char			sched_delayed;
>   	unsigned char			rel_deadline;
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 1f13b26efef5..4237daa5ac7a 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -125,11 +125,6 @@ struct sched_domain {
>   	unsigned int alb_failed;
>   	unsigned int alb_pushed;
>   
> -	/* cpulimit balancing */
> -	unsigned int clb_count;
> -	unsigned int clb_failed;
> -	unsigned int clb_pushed;
> -
>   	/* SD_BALANCE_EXEC stats */
>   	unsigned int sbe_count;
>   	unsigned int sbe_balanced;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 28fb5d0ecd89..f66ee9d07387 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8710,9 +8710,6 @@ void __init sched_init(void)
>   	INIT_LIST_HEAD(&root_task_group.children);
>   	INIT_LIST_HEAD(&root_task_group.siblings);
>   	autogroup_init(&init_task);
> -#ifdef CONFIG_CFS_CPULIMIT
> -	root_task_group.topmost_limited_ancestor = &root_task_group;
> -#endif
>   #endif /* CONFIG_CGROUP_SCHED */
>   
>   	for_each_possible_cpu(i) {
> @@ -9149,8 +9146,6 @@ struct task_group *sched_create_group(struct task_group *parent)
>   	return ERR_PTR(-ENOMEM);
>   }
>   
> -static void tg_update_topmost_limited_ancestor(struct task_group *tg);
> -
>   void sched_online_group(struct task_group *tg, struct task_group *parent)
>   {
>   	unsigned long flags;
> @@ -9164,9 +9159,6 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
>   	tg->parent = parent;
>   	INIT_LIST_HEAD(&tg->children);
>   	list_add_rcu(&tg->siblings, &parent->children);
> -#ifdef CONFIG_CFS_BANDWIDTH
> -	tg_update_topmost_limited_ancestor(tg);
> -#endif
>   	spin_unlock_irqrestore(&task_group_lock, flags);
>   
>   	online_fair_sched_group(tg);
> @@ -9650,7 +9642,6 @@ static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
>   static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
>   
>   static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
> -static void tg_limit_toggled(struct task_group *tg);
>   
>   static int __tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
>   				u64 burst)
> @@ -9730,10 +9721,6 @@ static int __tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
>   		if (cfs_rq->throttled)
>   			unthrottle_cfs_rq(cfs_rq);
>   	}
> -
> -	if (runtime_enabled != runtime_was_enabled)
> -		tg_limit_toggled(tg);
> -
>   	if (runtime_was_enabled && !runtime_enabled)
>   		cfs_bandwidth_usage_dec();
>   
> @@ -10002,49 +9989,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
>   }
>   
>   #ifdef CONFIG_CFS_CPULIMIT
> -static int __tg_update_topmost_limited_ancestor(struct task_group *tg, void *unused)
> -{
> -	struct task_group *parent = tg->parent;
> -
> -	/*
> -	 * Parent and none of its uncestors is limited? The task group should
> -	 * become a topmost limited uncestor then, provided it has a limit set.
> -	 * Otherwise inherit topmost limited ancestor from the parent.
> -	 */
> -	if (parent->topmost_limited_ancestor == parent &&
> -	    parent->cfs_bandwidth.quota == RUNTIME_INF)
> -		tg->topmost_limited_ancestor = tg;
> -	else
> -		tg->topmost_limited_ancestor = parent->topmost_limited_ancestor;
> -	return 0;
> -}
> -
> -static void tg_update_topmost_limited_ancestor(struct task_group *tg)
> -{
> -	__tg_update_topmost_limited_ancestor(tg, NULL);
> -}
> -
> -static void tg_limit_toggled(struct task_group *tg)
> -{
> -	if (tg->topmost_limited_ancestor != tg) {
> -		/*
> -		 * This task group is not a topmost limited ancestor, so both
> -		 * it and all its children must already point to their topmost
> -		 * limited ancestor, and we have nothing to do.
> -		 */
> -		return;
> -	}
> -
> -	/*
> -	 * This task group is a topmost limited ancestor. Walk over all its
> -	 * children and update their pointers to the topmost limited ancestor.
> -	 */
> -
> -	spin_lock_irq(&task_group_lock);
> -	walk_tg_tree_from(tg, __tg_update_topmost_limited_ancestor, tg_nop, NULL);
> -	spin_unlock_irq(&task_group_lock);
> -}
> -
>   static void tg_update_cpu_limit(struct task_group *tg)
>   {
>   	long quota, period;
> @@ -10059,14 +10003,13 @@ static void tg_update_cpu_limit(struct task_group *tg)
>   	}
>   
>   	tg->cpu_rate = rate;
> -	tg->nr_cpus = 0;
>   }
>   
> -static int tg_set_cpu_limit(struct task_group *tg,
> -			    unsigned long cpu_rate, unsigned int nr_cpus)
> +static int tg_set_cpu_limit(struct task_group *tg, unsigned int nr_cpus)
>   {
>   	int ret;
>   	unsigned long rate;
> +	unsigned long cpu_rate = tg->cpu_rate;
>   	u64 quota = RUNTIME_INF;
>   	u64 burst = tg_get_cfs_burst(tg);
>   	u64 period = default_cfs_period();
> @@ -10090,21 +10033,6 @@ static int tg_set_cpu_limit(struct task_group *tg,
>   	return ret;
>   }
>   
> -static u64 cpu_rate_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
> -{
> -	return css_tg(css)->cpu_rate;
> -}
> -
> -static int cpu_rate_write_u64(struct cgroup_subsys_state *css,
> -			      struct cftype *cftype, u64 rate)
> -{
> -	struct task_group *tg = css_tg(css);
> -
> -	if (rate > num_online_cpus() * MAX_CPU_RATE)
> -		rate = num_online_cpus() * MAX_CPU_RATE;
> -	return tg_set_cpu_limit(tg, rate, tg->nr_cpus);
> -}
> -
>   static u64 nr_cpus_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
>   {
>   	return css_tg(css)->nr_cpus;
> @@ -10117,15 +10045,9 @@ static int nr_cpus_write_u64(struct cgroup_subsys_state *css,
>   
>   	if (nr_cpus > num_online_cpus())
>   		nr_cpus = num_online_cpus();
> -	return tg_set_cpu_limit(tg, tg->cpu_rate, nr_cpus);
> +	return tg_set_cpu_limit(tg, nr_cpus);
>   }
>   #else
> -static void tg_update_topmost_limited_ancestor(struct task_group *tg)
> -{
> -}
> -static void tg_limit_toggled(struct task_group *tg)
> -{
> -}
>   static void tg_update_cpu_limit(struct task_group *tg)
>   {
>   }
> @@ -10257,11 +10179,6 @@ static struct cftype cpu_legacy_files[] = {
>   	},
>   #endif
>   #ifdef CONFIG_CFS_CPULIMIT
> -	{
> -		.name = "rate",
> -		.read_u64 = cpu_rate_read_u64,
> -		.write_u64 = cpu_rate_write_u64,
> -	},
>   	{
>   		.name = "nr_cpus",
>   		.read_u64 = nr_cpus_read_u64,
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5879d9a99908..f8d9d9ac0e83 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -129,10 +129,6 @@ static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
>   static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
>   #endif
>   
> -#ifdef CONFIG_CFS_CPULIMIT
> -unsigned int sysctl_sched_vcpu_hotslice = 5000000UL;
> -#endif
> -
>   #ifdef CONFIG_SYSCTL
>   static struct ctl_table sched_fair_sysctls[] = {
>   #ifdef CONFIG_CFS_BANDWIDTH
> @@ -155,16 +151,6 @@ static struct ctl_table sched_fair_sysctls[] = {
>   		.extra1		= SYSCTL_ZERO,
>   	},
>   #endif /* CONFIG_NUMA_BALANCING */
> -#ifdef CONFIG_CFS_CPULIMIT
> -	{
> -		.procname	= "sched_vcpu_hotslice",
> -		.data		= &sysctl_sched_vcpu_hotslice,
> -		.maxlen		= sizeof(unsigned int),
> -		.mode		= 0644,
> -		.proc_handler	= proc_dointvec_minmax,
> -		.extra1		= SYSCTL_ZERO,
> -	},
> -#endif
>   };
>   
>   static int __init sched_fair_sysctl_init(void)
> @@ -530,88 +516,6 @@ static int se_is_idle(struct sched_entity *se)
>   
>   #endif	/* CONFIG_FAIR_GROUP_SCHED */
>   
> -#ifdef CONFIG_CFS_CPULIMIT
> -static int cfs_rq_active(struct cfs_rq *cfs_rq)
> -{
> -	return cfs_rq->active;
> -}
> -
> -static void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
> -{
> -	/* if we canceled delayed dec, there is no need to do inc */
> -	if (hrtimer_try_to_cancel(&cfs_rq->active_timer) != 1)
> -		atomic_inc(&cfs_rq->tg->nr_cpus_active);
> -	cfs_rq->active = 1;
> -}
> -
> -static void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
> -{
> -	if (!cfs_rq->runtime_enabled || !sysctl_sched_vcpu_hotslice)
> -		postpone = 0;
> -
> -	if (!postpone) {
> -		cfs_rq->active = 0;
> -		atomic_dec(&cfs_rq->tg->nr_cpus_active);
> -	} else {
> -		hrtimer_start_range_ns(&cfs_rq->active_timer,
> -				ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
> -				HRTIMER_MODE_REL_PINNED);
> -	}
> -}
> -
> -static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
> -{
> -	struct cfs_rq *cfs_rq =
> -		container_of(timer, struct cfs_rq, active_timer);
> -	struct rq *rq = rq_of(cfs_rq);
> -	unsigned long flags;
> -
> -	raw_spin_rq_lock_irqsave(rq, flags);
> -	cfs_rq->active = !list_empty(&cfs_rq->tasks);
> -	raw_spin_rq_unlock_irqrestore(rq, flags);
> -
> -	atomic_dec(&cfs_rq->tg->nr_cpus_active);
> -
> -	return HRTIMER_NORESTART;
> -}
> -
> -static int check_cpulimit_spread(struct task_group *tg, int target_cpu)
> -{
> -	int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
> -	int nr_cpus_limit = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);
> -
> -	nr_cpus_limit = nr_cpus_limit && tg->nr_cpus ?
> -		min_t(int, nr_cpus_limit, tg->nr_cpus) :
> -		max_t(int, nr_cpus_limit, tg->nr_cpus);
> -
> -	if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
> -		return 1;
> -
> -	if (nr_cpus_active > nr_cpus_limit)
> -		return -1;
> -
> -	return cfs_rq_active(tg->cfs_rq[target_cpu]) ? 0 : -1;
> -}
> -#else /* !CONFIG_CFS_CPULIMIT */
> -static inline void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
> -{
> -}
> -
> -static inline void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
> -{
> -}
> -
> -static inline enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
> -{
> -	return 0;
> -}
> -
> -static inline int check_cpulimit_spread(struct task_group *tg, int target_cpu)
> -{
> -	return 1;
> -}
> -#endif /* CONFIG_CFS_CPULIMIT */
> -
>   static __always_inline
>   void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
>   
> @@ -3771,9 +3675,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>   
>   		account_numa_enqueue(rq, task_of(se));
>   		list_add(&se->group_node, &rq->cfs_tasks);
> -#ifdef CONFIG_CFS_CPULIMIT
> -		list_add(&se->cfs_rq_node, &cfs_rq->tasks);
> -#endif
>   	}
>   #endif
>   	cfs_rq->nr_running++;
> @@ -3789,9 +3690,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>   	if (entity_is_task(se)) {
>   		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
>   		list_del_init(&se->group_node);
> -#ifdef CONFIG_CFS_CPULIMIT
> -		list_del(&se->cfs_rq_node);
> -#endif
>   	}
>   #endif
>   	cfs_rq->nr_running--;
> @@ -5393,8 +5291,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>   {
>   	bool curr = cfs_rq->curr == se;
>   
> -	if (!cfs_rq->load.weight)
> -		inc_nr_active_cfs_rqs(cfs_rq);
>   	/*
>   	 * If we're the current task, we must renormalise before calling
>   	 * update_curr().
> @@ -5600,9 +5496,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>   	if (cfs_rq->nr_running == 0)
>   		update_idle_cfs_rq_clock_pelt(cfs_rq);
>   
> -	if (!cfs_rq->load.weight)
> -		dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
> -
>   	return true;
>   }
>   
> @@ -6648,10 +6541,6 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>   	cfs_rq->runtime_enabled = 0;
>   	INIT_LIST_HEAD(&cfs_rq->throttled_list);
>   	INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
> -#ifdef CONFIG_CFS_CPULIMIT
> -	hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> -	cfs_rq->active_timer.function = sched_cfs_active_timer;
> -#endif
>   }
>   
>   void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> @@ -7294,9 +7183,6 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>   static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
>   static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
>   static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
> -#ifdef CONFIG_CFS_CPULIMIT
> -static DEFINE_PER_CPU(struct balance_callback, cpulimit_cb_head);
> -#endif
>   
>   #ifdef CONFIG_NO_HZ_COMMON
>   
> @@ -8656,38 +8542,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>   	return target;
>   }
>   
> -static bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
> -{
> -#ifdef CONFIG_CFS_CPULIMIT
> -	struct task_group *tg;
> -	struct sched_domain *sd;
> -	int prev_cpu = task_cpu(p);
> -	int cpu;
> -
> -	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
> -	if (check_cpulimit_spread(tg, *new_cpu) > 0)
> -		return false;
> -
> -	if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
> -		return true;
> -
> -	if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
> -		*new_cpu = prev_cpu;
> -		return true;
> -	}
> -
> -	for_each_domain(*new_cpu, sd) {
> -		for_each_cpu_and(cpu, sched_domain_span(sd), p->cpus_ptr) {
> -			if (cfs_rq_active(tg->cfs_rq[cpu])) {
> -				*new_cpu = cpu;
> -				return true;
> -			}
> -		}
> -	}
> -#endif
> -	return false;
> -}
> -
>   /*
>    * select_task_rq_fair: Select target runqueue for the waking task in domains
>    * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -8756,9 +8610,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>   			break;
>   	}
>   
> -	if (select_runnable_cpu(p, &new_cpu))
> -		goto unlock;
> -
>   	if (unlikely(sd)) {
>   		/* Slow path */
>   		new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
> @@ -8766,7 +8617,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
>   		/* Fast path */
>   		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
>   	}
> -unlock:
>   	rcu_read_unlock();
>   
>   	return new_cpu;
> @@ -8992,51 +8842,6 @@ static struct task_struct *pick_task_fair(struct rq *rq)
>   static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
>   static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
>   
> -#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
> -static int cpulimit_balance_cpu_stop(void *data);
> -
> -static void trigger_cpulimit_balance(struct rq *this_rq)
> -{
> -	struct task_struct *p = this_rq->curr;
> -	struct task_group *tg;
> -	int this_cpu, cpu, target_cpu = -1;
> -	struct sched_domain *sd;
> -
> -	this_cpu = cpu_of(this_rq);
> -
> -	if (!p->se.on_rq || this_rq->active_balance)
> -		return;
> -
> -	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
> -	if (check_cpulimit_spread(tg, this_cpu) >= 0)
> -		return;
> -
> -	rcu_read_lock();
> -	for_each_domain(this_cpu, sd) {
> -		for_each_cpu_and(cpu, sched_domain_span(sd),
> -				 p->cpus_ptr) {
> -			if (cpu != this_cpu &&
> -			    cfs_rq_active(tg->cfs_rq[cpu])) {
> -				target_cpu = cpu;
> -				goto unlock;
> -			}
> -		}
> -	}
> -unlock:
> -	rcu_read_unlock();
> -
> -	if (target_cpu >= 0) {
> -		this_rq->active_balance = 1;
> -		this_rq->push_cpu = target_cpu;
> -		raw_spin_rq_unlock(this_rq);
> -		stop_one_cpu_nowait(this_rq->cpu,
> -				    cpulimit_balance_cpu_stop, this_rq,
> -				    &this_rq->active_balance_work);
> -		raw_spin_rq_lock(this_rq);
> -	}
> -}
> -#endif
> -
>   struct task_struct *
>   pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>   {
> @@ -9091,20 +8896,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
>   		__set_next_task_fair(rq, p, true);
>   	}
>   
> -#ifdef CONFIG_CFS_CPULIMIT
> -	queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu), trigger_cpulimit_balance);
> -#endif
> -
>   	return p;
>   
>   simple:
>   #endif
>   	put_prev_set_next_task(rq, prev, p);
>   
> -#ifdef CONFIG_CFS_CPULIMIT
> -	queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu), trigger_cpulimit_balance);
> -#endif
> -
>   	return p;
>   
>   idle:
> @@ -9529,37 +9326,6 @@ static inline int migrate_degrades_locality(struct task_struct *p,
>   }
>   #endif
>   
> -static int can_migrate_task_cpulimit(struct task_struct *p, struct lb_env *env)
> -{
> -#ifdef CONFIG_CFS_CPULIMIT
> -	struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
> -
> -	if (check_cpulimit_spread(tg, env->dst_cpu) < 0) {
> -		int cpu;
> -
> -		schedstat_inc(p->stats.nr_failed_migrations_cpulimit);
> -
> -		env->flags |= LBF_SOME_PINNED;
> -
> -		if (check_cpulimit_spread(tg, env->src_cpu) != 0)
> -			return 0;
> -
> -		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
> -			return 0;
> -
> -		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
> -			if (cfs_rq_active(tg->cfs_rq[cpu])) {
> -				env->flags |= LBF_DST_PINNED;
> -				env->new_dst_cpu = cpu;
> -				break;
> -			}
> -		}
> -		return 0;
> -	}
> -#endif
> -	return 1;
> -}
> -
>   /*
>    * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
>    */
> @@ -9570,8 +9336,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
>   
>   	lockdep_assert_rq_held(env->src_rq);
>   
> -        if (!can_migrate_task_cpulimit(p, env))
> -                return 0;
>   	/*
>   	 * We do not migrate tasks that are:
>   	 * 1) throttled_lb_pair, or
> @@ -9935,161 +9699,6 @@ static inline void update_blocked_load_tick(struct rq *rq) {}
>   static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
>   #endif
>   
> -#ifdef CONFIG_CFS_CPULIMIT
> -static unsigned long entity_h_load(struct sched_entity *se);
> -
> -static int can_migrate_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
> -{
> -	struct sched_entity *se;
> -	struct task_struct *p;
> -
> -	list_for_each_entry(se, &cfs_rq->tasks, cfs_rq_node) {
> -		p = task_of(se);
> -		if (task_curr(p) ||
> -		    !cpumask_test_cpu(env->dst_cpu, p->cpus_ptr))
> -			return 0;
> -	}
> -	env->flags &= ~LBF_ALL_PINNED;
> -	return 1;
> -}
> -
> -static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
> -{
> -	struct sched_entity *se, *tmp;
> -	int moved = 0;
> -
> -	list_for_each_entry_safe(se, tmp, &cfs_rq->tasks, cfs_rq_node) {
> -		struct task_struct *p = task_of(se);
> -		detach_task(p, env);
> -		attach_task(env->dst_rq, p);
> -		moved++;
> -	}
> -	return moved;
> -}
> -
> -static int move_task_groups(struct lb_env *env)
> -{
> -	struct cfs_rq *cfs_rq, *pos;
> -	struct task_group *tg;
> -	unsigned long load;
> -	int cur_pulled, pulled = 0;
> -
> -	if (env->imbalance <= 0)
> -		return 0;
> -
> -	for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
> -		if (cfs_rq->tg == &root_task_group)
> -			continue;
> -		/*
> -		 * A child always goes before its parent in a leaf_cfs_rq_list.
> -		 * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
> -		 * we could not migrate the child and therefore we should not
> -		 * even try to migrate the parent.
> -		 */
> -		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
> -			continue;
> -
> -		tg = cfs_rq->tg->topmost_limited_ancestor;
> -
> -		if (check_cpulimit_spread(tg, env->src_cpu) != 0 ||
> -		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]))
> -			continue;
> -
> -		load = entity_h_load(tg->se[env->src_cpu]);
> -		if ((load / 2) > env->imbalance)
> -			continue;
> -
> -		if (!can_migrate_task_group(cfs_rq, env))
> -			continue;
> -
> -		cur_pulled = move_task_group(cfs_rq, env);
> -		pulled += cur_pulled;
> -		env->imbalance -= load;
> -
> -		env->loop += cur_pulled;
> -		if (env->loop > env->loop_max)
> -			break;
> -
> -		if (env->imbalance <= 0)
> -			break;
> -	}
> -	return pulled;
> -}
> -
> -static int do_cpulimit_balance(struct lb_env *env)
> -{
> -	struct cfs_rq *cfs_rq, *pos;
> -	struct task_group *tg;
> -	int pushed = 0;
> -
> -	for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
> -		if (cfs_rq->tg == &root_task_group)
> -			continue;
> -		/* see move_task_groups for why we skip such groups */
> -		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
> -			continue;
> -		tg = cfs_rq->tg->topmost_limited_ancestor;
> -		if (check_cpulimit_spread(tg, env->src_cpu) < 0 &&
> -		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]) &&
> -		    can_migrate_task_group(cfs_rq, env))
> -			pushed += move_task_group(cfs_rq, env);
> -	}
> -	return pushed;
> -}
> -
> -static int cpulimit_balance_cpu_stop(void *data)
> -{
> -	struct rq *rq = data;
> -	int cpu = cpu_of(rq);
> -	int target_cpu = rq->push_cpu;
> -	struct rq *target_rq = cpu_rq(target_cpu);
> -	struct sched_domain *sd;
> -
> -	raw_spin_rq_lock_irq(rq);
> -
> -	if (unlikely(cpu != smp_processor_id() || !rq->active_balance ||
> -		     !cpu_online(target_cpu)))
> -		goto out_unlock;
> -
> -	if (unlikely(!rq->nr_running))
> -		goto out_unlock;
> -
> -	BUG_ON(rq == target_rq);
> -
> -	double_lock_balance(rq, target_rq);
> -	rcu_read_lock();
> -	for_each_domain(target_cpu, sd) {
> -		if (cpumask_test_cpu(cpu, sched_domain_span(sd)))
> -				break;
> -	}
> -	if (likely(sd)) {
> -		struct lb_env env = {
> -			.sd		= sd,
> -			.dst_cpu	= target_cpu,
> -			.dst_rq		= target_rq,
> -			.src_cpu	= cpu,
> -			.src_rq		= rq,
> -		};
> -
> -		schedstat_inc(sd->clb_count);
> -
> -		update_rq_clock(rq);
> -		update_rq_clock(target_rq);
> -		if (do_cpulimit_balance(&env))
> -			schedstat_inc(sd->clb_pushed);
> -		else
> -			schedstat_inc(sd->clb_failed);
> -	}
> -	rcu_read_unlock();
> -	double_unlock_balance(rq, target_rq);
> -
> -out_unlock:
> -	rq->active_balance = 0;
> -	raw_spin_rq_unlock_irq(rq);
> -	return 0;
> -}
> -#endif /* CONFIG_CFS_CPULIMIT */
> -
>   static bool __update_blocked_others(struct rq *rq, bool *done)
>   {
>   	bool updated;
> @@ -12126,20 +11735,6 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
>   
>   		local_irq_restore(rf.flags);
>   
> -#ifdef CONFIG_CFS_CPULIMIT
> -		if (!ld_moved && (env.flags & LBF_ALL_PINNED)) {
> -			env.loop = 0;
> -			local_irq_save(rf.flags);
> -			double_rq_lock(env.dst_rq, busiest);
> -			rq_repin_lock(busiest, &rf);
> -			update_rq_clock(env.dst_rq);
> -			cur_ld_moved = ld_moved = move_task_groups(&env);
> -			rq_unpin_lock(busiest, &rf);
> -			double_rq_unlock(env.dst_rq, busiest);
> -			local_irq_restore(rf.flags);
> -                }
> -#endif
> -
>   		if (env.flags & LBF_NEED_BREAK) {
>   			env.flags &= ~LBF_NEED_BREAK;
>   			goto more_balance;
> @@ -13640,9 +13235,6 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
>   void init_cfs_rq(struct cfs_rq *cfs_rq)
>   {
>   	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
> -#ifdef CONFIG_CFS_CPULIMIT
> -	INIT_LIST_HEAD(&cfs_rq->tasks);
> -#endif
>   	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
>   #ifdef CONFIG_SMP
>   	raw_spin_lock_init(&cfs_rq->removed.lock);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 0d3ff6958199..3ee88efdae0c 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -515,9 +515,6 @@ struct task_group {
>   #define MAX_CPU_RATE 1024
>   	unsigned long cpu_rate;
>   	unsigned int nr_cpus;
> -	atomic_t nr_cpus_active;
> -	struct task_group *topmost_limited_ancestor; /* self if none of the
> -							ancestors is limited */
>   #endif
>   };
>   
> @@ -696,9 +693,6 @@ struct cfs_rq {
>   #endif
>   
>   	struct rb_root_cached	tasks_timeline;
> -#ifdef CONFIG_CFS_CPULIMIT
> -	struct list_head tasks;
> -#endif
>   
>   	/*
>   	 * 'curr' points to currently running entity on this cfs_rq.



More information about the Devel mailing list