[Devel] [PATCH RHEL7 COMMIT] Revert "ve/sched: port boosting hacks against prio inversion"

Konstantin Khorenko khorenko at virtuozzo.com
Fri Jan 17 12:34:48 MSK 2020


The commit is pushed to "branch-rh7-3.10.0-1062.7.1.vz7.130.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1062.7.1.vz7.130.10
------>
commit 28a9251d7129c280d500734f9303e5af18c6c4ae
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date:   Fri Jan 17 12:34:45 2020 +0300

    Revert "ve/sched: port boosting hacks against prio inversion"
    
    This reverts commit 6a052cbf8aefba3deaba6cf93072c0810f38de92.
    
    The boosting hacks don't work well with the optimized pick_next_task_fair()
    from commit 678d5718d8d0 ("sched/fair: Optimize cgroup
    pick_next_task_fair()").
    
    They actually cause performance degradation, so let's remove them.
    
    https://jira.sw.ru/browse/PSBM-100188
    
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
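For context, the reverted mechanism worked roughly like this: each cfs_rq kept a
boosted_entities list, and once its CFS bandwidth was exhausted pick_next_entity()
would still pick an entity from that list (so it could finish kernel-mode work such
as dropping a lock) while flagging rq->resched_next to reschedule immediately
afterwards. Below is a minimal userspace-style sketch of that idea; the struct and
field names mirror the kernel's, but the types, list handling and pick policy are
simplified assumptions, not the actual kernel code.

/*
 * Illustrative sketch only: a reduced model of the boost-on-throttle pick
 * that this commit removes.  Not kernel code.
 */
#include <stdio.h>

struct sched_entity {
	const char *name;
	int boosted;                     /* e.g. woken while running / holds a lock */
};

struct cfs_rq {
	long runtime_remaining;          /* CFS bandwidth left; <= 0 means exhausted */
	int resched_next;                /* ask the scheduler to resched right after */
	struct sched_entity *fair_pick;  /* what normal vruntime ordering would pick */
	struct sched_entity *boosted;    /* stand-in for the boosted_entities list */
};

/* Reduced pick_next_entity(): once runtime is gone, prefer a boosted entity
 * so it can release its lock, but flag an immediate resched for fairness. */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = cfs_rq->fair_pick;

	if (cfs_rq->runtime_remaining <= 0 &&
	    cfs_rq->boosted && cfs_rq->boosted->boosted) {
		se = cfs_rq->boosted;
		cfs_rq->resched_next = 1;
	}
	return se;
}

int main(void)
{
	struct sched_entity worker = { "worker", 0 };
	struct sched_entity locker = { "lock-holder", 1 };
	struct cfs_rq rq = { 0 /* exhausted */, 0, &worker, &locker };

	struct sched_entity *next = pick_next_entity(&rq);
	printf("picked %s, resched_next=%d\n", next->name, rq.resched_next);
	return 0;
}

The hunks removing the real variant of this logic are the pick_next_entity() and
pick_next_task_fair() changes in kernel/sched/fair.c below.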
 include/linux/sched.h   |   6 --
 kernel/sched/core.c     |  14 +---
 kernel/sched/fair.c     | 186 ++----------------------------------------------
 kernel/sched/features.h |   3 -
 kernel/sched/sched.h    |   6 +-
 5 files changed, 6 insertions(+), 209 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 07f9954483b80..f0d976c3800a1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1258,11 +1258,6 @@ struct sched_entity {
 	struct list_head	cfs_rq_node;
 	unsigned int		on_rq;
 
-#ifdef CONFIG_CFS_BANDWIDTH
-	unsigned int            boosted;
-	struct list_head        boost_node;
-#endif
-
 	u64			exec_start;
 	u64			sum_exec_runtime;
 	u64			vruntime;
@@ -1496,7 +1491,6 @@ struct task_struct {
 	unsigned sched_contributes_to_load:1;
 	RH_KABI_FILL_HOLE(unsigned sched_remote_wakeup:1)
 	unsigned sched_interruptible_sleep:1;
-	unsigned woken_while_running:1;
 	unsigned sched_iothrottled_sleep:1;
 	unsigned :0; /* force alignment to the next boundary */
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f7f76cbe6f271..f264c209b1e77 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1776,7 +1776,6 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 	rq = __task_rq_lock(p);
 	if (task_on_rq_queued(p)) {
 		ttwu_do_wakeup(rq, p, wake_flags);
-		p->woken_while_running = 1;
 		ret = 1;
 	}
 	__task_rq_unlock(rq);
@@ -2102,10 +2101,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
-#ifdef CONFIG_CFS_BANDWIDTH
-	p->se.boosted = 0;
-#endif
-
 #ifdef CONFIG_SCHEDSTATS
 	/* Even if schedstat is disabled, there should not be garbage */
 	p->se.statistics = &p->statistics;
@@ -3789,7 +3784,6 @@ static void __sched __schedule(void)
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
-	int resched_next;
 	int cpu;
 
 need_resched:
@@ -3880,14 +3874,8 @@ static void __sched __schedule(void)
 
 	balance_callback(rq);
 
-	resched_next = READ_ONCE(rq->resched_next);
-	if (resched_next) {
-		set_tsk_need_resched(current);
-		rq->resched_next = 0;
-	}
-
 	sched_preempt_enable_no_resched();
-	if (!resched_next && need_resched())
+	if (need_resched())
 		goto need_resched;
 }
 STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0e7ab2050af04..0d874caab7bd0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -454,16 +454,6 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 	return cfs_bandwidth_used() && cfs_rq->throttled;
 }
 
-static inline int cfs_rq_has_boosted_entities(struct cfs_rq *cfs_rq)
-{
-	return !list_empty(&cfs_rq->boosted_entities);
-}
-
-static inline int entity_boosted(struct sched_entity *se)
-{
-	return se->boosted;
-}
-
 #else /* !CONFIG_CFS_BANDWIDTH */
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -471,16 +461,6 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 	return 0;
 }
 
-static inline int cfs_rq_has_boosted_entities(struct cfs_rq *cfs_rq)
-{
-	return 0;
-}
-
-static inline int entity_boosted(struct sched_entity *se)
-{
-	return 0;
-}
-
 #endif /* CONFIG_CFS_BANDWIDTH */
 
 #ifdef CONFIG_CFS_CPULIMIT
@@ -994,100 +974,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
 }
 
-#ifdef CONFIG_CFS_BANDWIDTH
-static inline void update_entity_boost(struct sched_entity *se)
-{
-	if (!entity_is_task(se))
-		se->boosted = cfs_rq_has_boosted_entities(group_cfs_rq(se));
-	else {
-		struct task_struct *p = task_of(se);
-
-		if (unlikely(p != current))
-			return;
-
-		if (!(preempt_count() & PREEMPT_ACTIVE)) {
-			se->boosted = sched_feat(BOOST_WAKEUPS) &&
-					p->woken_while_running;
-			p->woken_while_running = 0;
-		} else
-			se->boosted = sched_feat(BOOST_PREEMPT) &&
-				      !p->may_throttle;
-	}
-}
-
-static int check_enqueue_boost(struct rq *rq, struct task_struct *p, int flags)
-{
-	if (sched_feat(BOOST_WAKEUPS) && (flags & ENQUEUE_WAKEUP))
-		p->se.boosted = 1;
-	return p->se.boosted;
-}
-
-static inline void __enqueue_boosted_entity(struct cfs_rq *cfs_rq,
-					    struct sched_entity *se)
-{
-	list_add(&se->boost_node, &cfs_rq->boosted_entities);
-}
-
-static inline void __dequeue_boosted_entity(struct cfs_rq *cfs_rq,
-					    struct sched_entity *se)
-{
-	list_del(&se->boost_node);
-}
-
-static int enqueue_boosted_entity(struct cfs_rq *cfs_rq,
-				  struct sched_entity *se)
-{
-	if (entity_is_task(se) || !entity_boosted(se)) {
-		if (se != cfs_rq->curr)
-			__enqueue_boosted_entity(cfs_rq, se);
-		se->boosted = 1;
-		return 1;
-	}
-
-	return 0;
-}
-
-static int dequeue_boosted_entity(struct cfs_rq *cfs_rq,
-				  struct sched_entity *se)
-{
-	if (entity_is_task(se) ||
-	    !cfs_rq_has_boosted_entities(group_cfs_rq(se))) {
-		if (se != cfs_rq->curr)
-			__dequeue_boosted_entity(cfs_rq, se);
-		if (!entity_is_task(se))
-			se->boosted = 0;
-		return 1;
-	}
-
-	return 0;
-}
-#else
-static inline void update_entity_boost(struct sched_entity *se) {}
-
-static inline int check_enqueue_boost(struct rq *rq,
-				      struct task_struct *p, int flags)
-{
-	return 0;
-}
-
-static inline void __enqueue_boosted_entity(struct cfs_rq *cfs_rq,
-					    struct sched_entity *se) {}
-static inline void __dequeue_boosted_entity(struct cfs_rq *cfs_rq,
-					    struct sched_entity *se) {}
-
-static inline int enqueue_boosted_entity(struct cfs_rq *cfs_rq,
-					 struct sched_entity *se)
-{
-	return 0;
-}
-
-static inline int dequeue_boosted_entity(struct cfs_rq *cfs_rq,
-					 struct sched_entity *se)
-{
-	return 0;
-}
-#endif
-
 /**************************************************
  * Scheduling class queueing methods:
  */
@@ -3105,7 +2991,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	se->vruntime = max_vruntime(se->vruntime, vruntime);
 }
 
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq, int flags);
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 
 static inline void check_schedstat_required(void)
 {
@@ -3206,7 +3092,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (cfs_rq->nr_running == 1) {
 		list_add_leaf_cfs_rq(cfs_rq);
-		check_enqueue_throttle(cfs_rq, flags);
+		check_enqueue_throttle(cfs_rq);
 	}
 }
 
@@ -3361,8 +3247,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		if (schedstat_enabled())
 			update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
-		if (entity_boosted(se))
-			__dequeue_boosted_entity(cfs_rq, se);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
@@ -3439,20 +3323,6 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
 		se = cfs_rq->next;
 
-#ifdef CONFIG_CFS_BANDWIDTH
-	/*
-	 * Give boosted tasks a chance to finish their kernel-mode execution in
-	 * order to avoid prio inversion in case they hold a lock, but resched
-	 * them asap for the sake of fairness.
-	 */
-	if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) {
-		if (cfs_rq_has_boosted_entities(cfs_rq))
-			se = list_first_entry(&cfs_rq->boosted_entities,
-					      struct sched_entity, boost_node);
-		rq_of(cfs_rq)->resched_next = 1;
-	}
-#endif
-
 	clear_buddies(cfs_rq, se);
 
 	return se;
@@ -3469,14 +3339,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 
-	update_entity_boost(prev);
-	if (entity_boosted(prev) && prev->on_rq) {
-		__enqueue_boosted_entity(cfs_rq, prev);
-		if (unlikely(cfs_rq_throttled(cfs_rq)))
-			/* prev was moved to throttled cfs_rq */
-			unthrottle_cfs_rq(cfs_rq);
-	}
-
 	/* throttle cfs_rqs exceeding runtime */
 	check_cfs_rq_runtime(cfs_rq);
 
@@ -4083,7 +3945,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
  * runtime as update_curr() throttling can not not trigger until it's on-rq.
  */
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq, int flags)
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_bandwidth_used())
 		return;
@@ -4108,9 +3970,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq, int flags)
 		}
 	}
 
-	if (flags & ENQUEUE_BOOST)
-		return;
-
 	/* an active group must be handled by the update_curr()->put() path */
 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 		return;
@@ -4158,9 +4017,6 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	if (cfs_rq_throttled(cfs_rq))
 		return true;
 
-	if (cfs_rq_has_boosted_entities(cfs_rq))
-		return false;
-
 	throttle_cfs_rq(cfs_rq);
 	return true;
 }
@@ -4247,7 +4103,6 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
-	INIT_LIST_HEAD(&cfs_rq->boosted_entities);
 #ifdef CONFIG_CFS_CPULIMIT
 	hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_rq->active_timer.function = sched_cfs_active_timer;
@@ -4310,7 +4165,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq, int flags) {}
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static inline void sync_throttle(struct task_group *tg, int cpu) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
@@ -4402,14 +4257,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
-	int boost = check_enqueue_boost(rq, p, flags);
 
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
 		cfs_rq = cfs_rq_of(se);
-		if (boost)
-			flags |= ENQUEUE_BOOST;
 		enqueue_entity(cfs_rq, se, flags);
 
 		/*
@@ -4422,9 +4274,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 		cfs_rq->h_nr_running++;
 
-		if (boost)
-			boost = enqueue_boosted_entity(cfs_rq, se);
-
 		flags = ENQUEUE_WAKEUP;
 	}
 
@@ -4435,9 +4284,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		if (boost)
-			boost = enqueue_boosted_entity(cfs_rq, se);
-
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
@@ -4445,14 +4291,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se) {
 		update_rq_runnable_avg(rq, rq->nr_running);
 		inc_nr_running(rq);
-	} else if (boost) {
-		for_each_sched_entity(se) {
-			cfs_rq = cfs_rq_of(se);
-			if (!enqueue_boosted_entity(cfs_rq, se))
-				break;
-			if (cfs_rq_throttled(cfs_rq))
-				unthrottle_cfs_rq(cfs_rq);
-		}
 	}
 	hrtick_update(rq);
 }
@@ -4468,7 +4306,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
-	int boosted = entity_boosted(se);
 	int task_sleep = flags & DEQUEUE_SLEEP;
 
 	if (task_sleep)
@@ -4488,9 +4325,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 		cfs_rq->h_nr_running--;
 
-		if (boosted)
-			boosted = dequeue_boosted_entity(cfs_rq, se);
-
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			/* Avoid re-evaluating load for this entity: */
@@ -4513,9 +4347,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		if (boosted)
-			boosted = dequeue_boosted_entity(cfs_rq, se);
-
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
@@ -5493,15 +5324,6 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
-	if (rq->resched_next && !entity_boosted(&p->se)) {
-		/*
-		 * seems boosted tasks have gone from the throttled cfs_rq,
-		 * pick another task then
-		 */
-		resched_curr(rq);
-		rq->resched_next = 0;
-	}
-
 	return p;
 
 idle:
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 5694d74b7bcf9..57497b6d0dfb5 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -82,6 +82,3 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
  */
 SCHED_FEAT(NUMA_RESIST_LOWER, false)
 #endif
-
-SCHED_FEAT(BOOST_WAKEUPS, true)
-SCHED_FEAT(BOOST_PREEMPT, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e8d099b7cb284..86181a147b941 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,8 +477,6 @@ struct cfs_rq {
 	u64 throttled_clock_task_time;
 	int throttled, throttle_count, throttle_uptodate;
 	struct list_head throttled_list;
-
-	struct list_head boosted_entities;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #ifdef CONFIG_SMP
 	RH_KABI_EXTEND(u64 last_h_load_update)
@@ -678,8 +676,7 @@ struct rq {
 #ifdef CONFIG_NO_HZ_FULL
 	unsigned long last_sched_tick;
 #endif
-	signed char skip_clock_update;
-	unsigned char resched_next;
+	int skip_clock_update;
 
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
@@ -1313,7 +1310,6 @@ static const u32 prio_to_wmult[40] = {
 #endif
 #define ENQUEUE_HEAD		0x08
 #define ENQUEUE_REPLENISH	0x10
-#define ENQUEUE_BOOST		0x20
 
 #define DEQUEUE_SLEEP		0x01
 #define DEQUEUE_SAVE		0x02
