[Devel] [PATCH RHEL COMMIT] sched: Account cfs_rq::nr_iowait

Konstantin Khorenko khorenko at virtuozzo.com
Fri Sep 24 14:49:30 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit 4c7d3e7c6653a16504ead52bd079e307ad7051d4
Author: Kirill Tkhai <ktkhai at virtuozzo.com>
Date:   Fri Sep 24 14:49:30 2021 +0300

    sched: Account cfs_rq::nr_iowait
    
    Extracted from "Initial patch".
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    +++
    sched: fix cfs_rq::nr_iowait accounting
    
    After recent RedHat (b6be9ae "rh7: import RHEL7 kernel-3.10.0-957.12.2.el7")
    following sequence:
    
      update_stats_dequeue()
        dequeue_sleeper()
          cfs_rq->nr_iowait++
    
    is called conditionally, and cfs_rq::nr_iowait is incremented only if
    schedstat_enabled() is true.
    
    However, it is expected that this counter is handled independently of
    other scheduler statistics gathering. To fix it, move the cfs_rq::nr_iowait
    increment out of the schedstat_enabled() check.
    
    https://jira.sw.ru/browse/PSBM-93850
    Signed-off-by: Jan Dakinevich <jan.dakinevich at virtuozzo.com>
    
    Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    Reviewed-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    khorenko@ note: after this patch "nr_iowait" should be accounted properly until
    disk io limits are set for a Container and throttling is activated. Taking into
    account at the moment "nr_iowait" is always broken, let's apply current patch
    and rework "nr_iowait" accounting to honor throttle code later.
    
    At the moment throttle_cfs_rq() will inc nr_iowait (in dequeue_entity()) while
    unthrottle_cfs_rq() won't decrement it in enqueue_entity().
    
    Changes when porting to VZ8:
    - Drop hunk in try_to_wake_up_local() as old code path:
      schedule
        __schedule
          try_to_wake_up_local
            nr_iowait_dec
    is now replaced by mainstream with:
      schedule
        sched_submit_work
          wq_worker_sleeping
            wake_up_process
              try_to_wake_up
                nr_iowait_dec
    and there is no more try_to_wake_up_local().
    - Replace removal hunk in dequeue_sleeper() with corresponding hunk in
    update_stats_dequeue.
    
    https://jira.sw.ru/browse/PSBM-127846
    (cherry-picked from vz7 commit 0bf288fedba7 ("sched: fix
    cfs_rq::nr_iowait accounting"))
    Fixes: ebd33cb22f39 ("sched: Account cfs_rq::nr_iowait")
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    https://jira.sw.ru/browse/PSBM-133986
    
    task->state -> READ_ONCE(task->__state)
    
    (cherry picked from commit 30967ce528450629853dc71362fdd1aef21a3245)
    Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
---
 kernel/sched/core.c  | 17 +++++++++++++++++
 kernel/sched/fair.c  | 25 +++++++++++++++++++++++++
 kernel/sched/sched.h |  3 +++
 3 files changed, 45 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7a1d5d09ade..c0c6a90ea32c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3854,6 +3854,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 */
 	smp_cond_load_acquire(&p->on_cpu, !VAL);
 
+	if (p->in_iowait && p->sched_class->nr_iowait_dec) {
+		struct rq_flags rf;
+		struct rq *rq;
+
+		rq = __task_rq_lock(p, &rf);
+		p->sched_class->nr_iowait_dec(p);
+		__task_rq_unlock(rq, &rf);
+	}
+
 	cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
 	if (task_cpu(p) != cpu) {
 		if (p->in_iowait) {
@@ -9546,6 +9555,10 @@ void sched_move_task(struct task_struct *tsk)
 	if (queued)
 		dequeue_task(rq, tsk, queue_flags);
 	else {
+		if (!(READ_ONCE(tsk->__state) ==  TASK_WAKING) && tsk->in_iowait &&
+		    tsk->sched_class->nr_iowait_dec)
+			tsk->sched_class->nr_iowait_dec(tsk);
+
 		if (tsk->sched_contributes_to_load)
 			task_cfs_rq(tsk)->nr_unint--;
 
@@ -9560,6 +9573,10 @@ void sched_move_task(struct task_struct *tsk)
 	if (queued)
 		enqueue_task(rq, tsk, queue_flags);
 	else {
+		if (!(READ_ONCE(tsk->__state) ==  TASK_WAKING) && tsk->in_iowait &&
+		    tsk->sched_class->nr_iowait_inc)
+			tsk->sched_class->nr_iowait_inc(tsk);
+
 		if (tsk->sched_contributes_to_load)
 			task_cfs_rq(tsk)->nr_unint++;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44c452072a1b..fb32b3480e19 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4372,6 +4372,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	update_stats_dequeue(cfs_rq, se, flags);
 
+	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
+		struct task_struct *tsk = task_of(se);
+
+		if (tsk->in_iowait)
+			cfs_rq->nr_iowait++;
+	}
+
 	clear_buddies(cfs_rq, se);
 
 	if (se != cfs_rq->curr)
@@ -11483,6 +11490,22 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
 	return rr_interval;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void nr_iowait_dec_fair(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+	cfs_rq->nr_iowait--;
+}
+
+static void nr_iowait_inc_fair(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+	cfs_rq->nr_iowait++;
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 /*
  * All the scheduling class methods:
  */
@@ -11525,6 +11548,8 @@ DEFINE_SCHED_CLASS(fair) = {
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.task_change_group	= task_change_group_fair,
+	.nr_iowait_inc          = nr_iowait_inc_fair,
+	.nr_iowait_dec          = nr_iowait_dec_fair,
 #endif
 
 #ifdef CONFIG_UCLAMP_TASK
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 649210b93e11..ed6e12e3eb65 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -525,6 +525,7 @@ struct cfs_rq {
 	unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
 
+	unsigned int nr_iowait;
 	unsigned int nr_unint;
 
 	u64			exec_clock;
@@ -2165,6 +2166,8 @@ struct sched_class {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*task_change_group)(struct task_struct *p, int type);
 #endif
+	void (*nr_iowait_inc) (struct task_struct *p);
+	void (*nr_iowait_dec) (struct task_struct *p);
 };
 
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)


More information about the Devel mailing list