[Devel] [PATCH RHEL9 COMMIT] sched/ve: calc_load_ve -- use raw spinlock

Konstantin Khorenko khorenko at virtuozzo.com
Thu Oct 14 19:36:59 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-4.vz9.10.7
------>
commit 7e96372c17b5c4c7095678779a42729d28a043af
Author: Cyrill Gorcunov <gorcunov at gmail.com>
Date:   Thu Oct 14 19:36:59 2021 +0300

    sched/ve: calc_load_ve -- use raw spinlock
    
    The @load_ve_lock spinlock guards manipulations of @ve_root_list;
    at the same time, calc_load_ve() is executed from IRQ context,
    which triggers an "invalid wait context" bug:
    
     | [    5.195868] =============================
     | [    5.195877] [ BUG: Invalid wait context ]
     | [    5.195887] 5.14.0.ovz9.10.1 #37 Tainted: G         C     X --------- ---
     | [    5.195902] -----------------------------
     | [    5.195911] swapper/0/0 is trying to lock:
     | [    5.196327] ffffffff872d8438 (load_ve_lock){....}-{3:3}, at: calc_load_ve+0x15/0x1c0
     | [    5.196742] other info that might help us debug this:
     | [    5.196807] context-{2:2}
     | [    5.196807] no locks held by swapper/0/0.
     | [    5.196807] stack backtrace:
     | [    5.196807] CPU: 0 PID: 0 Comm: swapper/0 ve: / Tainted: G         C     X --------- ---  5.14.0.ovz9.10.1 #37 10.1
     | [    5.196807] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-4.fc34 04/01/2014
     | [    5.196807] Call Trace:
     | [    5.196807]  <IRQ>
     | [    5.196807]  dump_stack_lvl+0x57/0x7d
     | [    5.196807]  __lock_acquire.cold+0x28b/0x2cd
     | [    5.196807]  ? __lock_acquire+0x3b1/0x1f20
     | [    5.196807]  lock_acquire+0xca/0x300
     | [    5.196807]  ? calc_load_ve+0x15/0x1c0
     | [    5.196807]  ? kvm_sched_clock_read+0x14/0x40
     | [    5.196807]  ? sched_clock_local+0xe/0x80
     | [    5.196807]  ? sched_clock_cpu+0xa5/0xc0
     | [    5.196807]  _raw_spin_lock+0x34/0x80
     | [    5.196807]  ? calc_load_ve+0x15/0x1c0
     | [    5.196807]  calc_load_ve+0x15/0x1c0
     | [    5.196807]  tick_do_update_jiffies64+0x115/0x150
     | [    5.196807]  tick_irq_enter+0x6c/0xe0
     | [    5.196807]  irq_enter_rcu+0x79/0x80
     | [    5.196807]  sysvec_apic_timer_interrupt+0x95/0xd0
     | [    5.196807]  </IRQ>
     | [    5.196807]  asm_sysvec_apic_timer_interrupt+0x12/0x20
     | [    5.196807] RIP: 0010:default_idle+0x10/0x20
     | [    5.196807] RSP: 0018:ffffffff87203ea8 EFLAGS: 00000202
     | [    5.196807] RAX: ffffffff86380df0 RBX: 0000000000000000 RCX: 0000000000000001
     | [    5.196807] RDX: 0000000000000000 RSI: ffffffff86ee3980 RDI: ffffffff86e1050e
     | [    5.196807] RBP: ffffffff87260a00 R08: 0000000000000001 R09: 0000000000000001
     | [    5.196807] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
     | [    5.196807] R13: 0000000000000000 R14: ffffffff87260120 R15: 0000000000000000
     | [    5.196807]  ? mwait_idle+0x70/0x70
     | [    5.196807]  ? mwait_idle+0x70/0x70
     | [    5.196807]  default_idle_call+0x59/0x90
     | [    5.196807]  do_idle+0x217/0x2b0
     | [    5.196807]  cpu_startup_entry+0x19/0x20
     | [    5.196807]  start_kernel+0x997/0x9bc
     | [    5.196807]  ? copy_bootdata+0x18/0x55
     | [    5.196807]  secondary_startup_64_no_verify+0xc2/0xcb
    
    Note that the problem mostly comes from the RT camp, where spinlocks
    become sleepable and thus can't be used in IRQ context (on our kernel
    lockdep reports this only with CONFIG_PROVE_RAW_LOCK_NESTING set).
    Since we know we're operating in IRQ context, let's use a raw
    spinlock instead.
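
    For reference, a minimal sketch of the raw-spinlock pattern the patch
    switches to (the example_* names are hypothetical, not part of this
    patch). A raw_spinlock_t stays a true spinning lock even on
    PREEMPT_RT, where a plain spinlock_t becomes sleepable, so only the
    raw variant may be taken from hard-IRQ context:

    	#include <linux/spinlock.h>
    	#include <linux/list.h>

    	/* Hypothetical example data, not from the patch. */
    	static DEFINE_RAW_SPINLOCK(example_lock);
    	static LIST_HEAD(example_list);

    	/* Callable from any context: save and restore the IRQ state. */
    	static void example_link(struct list_head *entry)
    	{
    		unsigned long flags;

    		raw_spin_lock_irqsave(&example_lock, flags);
    		list_add(entry, &example_list);
    		raw_spin_unlock_irqrestore(&example_lock, flags);
    	}

    	/* Called from hard-IRQ context only: IRQs are already disabled,
    	 * so the plain raw_spin_lock() variant is sufficient. */
    	static void example_walk(void)
    	{
    		struct list_head *pos;

    		raw_spin_lock(&example_lock);
    		list_for_each(pos, &example_list)
    			; /* process each entry under the lock */
    		raw_spin_unlock(&example_lock);
    	}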
    
    Also make the unlock happen earlier: there is no need to hold the
    lock once we've finished traversing the @ve_root_list list.
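
    In sketch form (the actual hunks are below), the lock now covers only
    the list walk, while the nr_unint_avg update is already serialized by
    its own seqcount and does not need @load_ve_lock:

    	raw_spin_lock(&load_ve_lock);
    	list_for_each_entry(tg, &ve_root_list, ve_root_list) {
    		/* recompute tg->avenrun[] for this VE */
    	}
    	raw_spin_unlock(&load_ve_lock);	/* drop right after the walk */

    	write_seqcount_begin(&kstat_glob.nr_unint_avg_seq);
    	/* update kstat_glob.nr_unint_avg[] */
    	write_seqcount_end(&kstat_glob.nr_unint_avg_seq);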
    
    https://jira.sw.ru/browse/PSBM-134756
    
    Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
    Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 kernel/sched/core.c    | 10 +++++-----
 kernel/sched/loadavg.c |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e96fab5b93ec..a15735d460b1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10036,18 +10036,18 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
 
 #ifdef CONFIG_VE
 LIST_HEAD(ve_root_list);
-DEFINE_SPINLOCK(load_ve_lock);
+DEFINE_RAW_SPINLOCK(load_ve_lock);
 
 void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
 	unsigned long flags;
 
-	spin_lock_irqsave(&load_ve_lock, flags);
+	raw_spin_lock_irqsave(&load_ve_lock, flags);
 	BUG_ON(!(css->flags & CSS_ONLINE));
 	if (list_empty(&tg->ve_root_list))
 		list_add(&tg->ve_root_list, &ve_root_list);
-	spin_unlock_irqrestore(&load_ve_lock, flags);
+	raw_spin_unlock_irqrestore(&load_ve_lock, flags);
 }
 
 void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
@@ -10055,9 +10055,9 @@ void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
        struct task_group *tg = css_tg(css);
        unsigned long flags;
 
-       spin_lock_irqsave(&load_ve_lock, flags);
+       raw_spin_lock_irqsave(&load_ve_lock, flags);
        list_del_init(&tg->ve_root_list);
-       spin_unlock_irqrestore(&load_ve_lock, flags);
+       raw_spin_unlock_irqrestore(&load_ve_lock, flags);
 }
 #endif /* CONFIG_VE */
 
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 74cfc755c87d..8a5fa39b8b7b 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -110,7 +110,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
 
 #ifdef CONFIG_VE
 extern struct list_head ve_root_list;
-extern spinlock_t load_ve_lock;
+extern raw_spinlock_t load_ve_lock;
 
 void calc_load_ve(void)
 {
@@ -122,7 +122,7 @@ void calc_load_ve(void)
 	 * This is called without jiffies_lock, and here we protect
 	 * against very rare parallel execution on two or more cpus.
 	 */
-	spin_lock(&load_ve_lock);
+	raw_spin_lock(&load_ve_lock);
 	list_for_each_entry(tg, &ve_root_list, ve_root_list) {
 		nr_active = 0;
 		for_each_possible_cpu(i) {
@@ -146,6 +146,7 @@ void calc_load_ve(void)
 		tg->avenrun[1] = calc_load(tg->avenrun[1], EXP_5, nr_active);
 		tg->avenrun[2] = calc_load(tg->avenrun[2], EXP_15, nr_active);
 	}
+	raw_spin_unlock(&load_ve_lock);
 
 	nr_unint = nr_uninterruptible() * FIXED_1;
 
@@ -154,7 +155,6 @@ void calc_load_ve(void)
 	calc_load(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint);
 	calc_load(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint);
 	write_seqcount_end(&kstat_glob.nr_unint_avg_seq);
-	spin_unlock(&load_ve_lock);
 }
 #endif /* CONFIG_VE */
 

