[Devel] [PATCH RHEL7] printk: Add touch_all_softlockup_watchdogs() to console_unlock()

Kirill Tkhai ktkhai at virtuozzo.com
Fri Jul 22 05:49:54 PDT 2016


Below is two stacks. The first one (PID: 0) is stack of paniced
soft-lockup task. The task is spinning in br_hello_timer_expired()
and is waiting for spin lock.

Spinlock is held by the second task (PID: 791031). Nothing serious
is made by him, the only is printk() under the spin lock.

The problem is in there is sysrq "show state" trigger is executed
on other cpu. It populates ring buffer very fast, while flushing
takes a lot of time:

time /bin/echo t > /proc/sysrq-trigger

real	0m48.320s
user	0m0.000s
sys	0m44.806s

Task with PID=791031 is owning console_sem, and it is flushing
the buffer. "Show state" finishes populating the ring buffer,
and does not flush the buffer, because it is not owning console_sem.
The buffer in this moment is about to be full. So,
touch_all_softlockup_watchdogs() from show_state_filter() is not
called anymore, and the buffer is flushing without touching soft dog.
This lead to soft lockup panic, observed in
https://jira.sw.ru/browse/PSBM-49893

The patch adds touching to soft dog on every cpu every
PREFIX_MAX + LOG_LINE_MAX (i.e. 1024) symbols printed
to prevent the panic described.

PID: 0      TASK: ffffffff8194d620  CPU: 0   COMMAND: "swapper/0"
 #0 [ffff88041fc03a98] machine_kexec at ffffffff8105249b
 #1 [ffff88041fc03af8] crash_kexec at ffffffff811032c2
 #2 [ffff88041fc03bc8] panic at ffffffff8162c7c0
 #3 [ffff88041fc03c48] watchdog_timer_fn at ffffffff8112c426
 #4 [ffff88041fc03c80] __hrtimer_run_queues at ffffffff810ab902
 #5 [ffff88041fc03cd8] hrtimer_interrupt at ffffffff810abea0
 #6 [ffff88041fc03d28] local_apic_timer_interrupt at ffffffff81049de7
 #7 [ffff88041fc03d40] smp_apic_timer_interrupt at ffffffff816452cf
 #8 [ffff88041fc03d58] apic_timer_interrupt at ffffffff8164399d
 #9 [ffff88041fc03e00] br_hello_timer_expired at ffffffffa04c141e [bridge]
#10 [ffff88041fc03e18] call_timer_fn at ffffffff8108bde6
#11 [ffff88041fc03e50] run_timer_softirq at ffffffff8108ead7
#12 [ffff88041fc03ec8] __do_softirq at ffffffff81085377
#13 [ffff88041fc03f48] call_softirq at ffffffff8164465c
#14 [ffff88041fc03f60] do_softirq at ffffffff81016f05
#15 [ffff88041fc03f80] irq_exit at ffffffff81085725
#16 [ffff88041fc03f98] smp_apic_timer_interrupt at ffffffff816452d5
#17 [ffff88041fc03fb0] apic_timer_interrupt at ffffffff8164399d

PID: 791031  TASK: ffff880403e6bf70  CPU: 1   COMMAND: "kworker/u16:10"
 #0 [ffff88041fc45e70] crash_nmi_callback at ffffffff810461d2
 #1 [ffff88041fc45e80] nmi_handle at ffffffff8163b739
 #2 [ffff88041fc45ec8] do_nmi at ffffffff8163b8e9
 #3 [ffff88041fc45ef0] end_repeat_nmi at ffffffff8163ab13
    [exception RIP: io_serial_in+21]
    RIP: ffffffff813d3805  RSP: ffff8802804e7990  RFLAGS: 00000002
    RAX: 0000000000000600  RBX: ffffffff8232bca0  RCX: 0000000000000000
    RDX: 00000000000003fd  RSI: 00000000000003fd  RDI: ffffffff8232bca0
    RBP: ffff8802804e7990   R8: 0000000000000000   R9: ffff880403d94ec0
    R10: 0000000000002800  R11: ffffc9001253fff8  R12: 0000000000002700
    R13: 0000000000000020  R14: ffffffff81cab9a4  R15: 0000000000000044
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
 #4 [ffff8802804e7990] io_serial_in at ffffffff813d3805
 #5 [ffff8802804e7998] wait_for_xmitr at ffffffff813d438b
 #6 [ffff8802804e79c0] serial8250_console_putchar at ffffffff813d454c
 #7 [ffff8802804e79e0] uart_console_write at ffffffff813cefea
 #8 [ffff8802804e7a10] serial8250_console_write at ffffffff813d449e
 #9 [ffff8802804e7a58] call_console_drivers.constprop.20 at ffffffff8107c721
#10 [ffff8802804e7a80] console_unlock at ffffffff8107d722
#11 [ffff8802804e7ab8] __vprintk_emit at ffffffff8107dc15
#12 [ffff8802804e7b38] printk at ffffffff8162c9fa
#13 [ffff8802804e7ba0] br_log_state at ffffffffa04bf432 [bridge]
#14 [ffff8802804e7bb0] br_stp_disable_port at ffffffffa04c0a57 [bridge]
#15 [ffff8802804e7be8] br_device_event at ffffffffa04bf138 [bridge]
#16 [ffff8802804e7c18] notifier_call_chain at ffffffff8163e70c
#17 [ffff8802804e7c50] raw_notifier_call_chain at ffffffff810ad726
#18 [ffff8802804e7c60] call_netdevice_notifiers at ffffffff8152b4bd
#19 [ffff8802804e7c80] dev_close_many at ffffffff8152b76c
#20 [ffff8802804e7cc8] rollback_registered_many at ffffffff8152cd18
#21 [ffff8802804e7cf0] unregister_netdevice_many at ffffffff8152cf29
#22 [ffff8802804e7d10] default_device_exit_batch at ffffffff8152e5b1
#23 [ffff8802804e7d88] ops_exit_list at ffffffff81527663
#24 [ffff8802804e7db8] cleanup_net at ffffffff81528740
#25 [ffff8802804e7e20] process_one_work at ffffffff8109ecfb
#26 [ffff8802804e7e68] worker_thread at ffffffff8109facb
#27 [ffff8802804e7ec8] kthread at ffffffff810a754f
#28 [ffff8802804e7f50] ret_from_fork at ffffffff81642c98

Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 kernel/printk.c |    6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/printk.c b/kernel/printk.c
index 91766fc..cf04d83 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -2244,6 +2244,7 @@ void console_unlock(void)
 	struct log_state *log = &init_log_state;
 	unsigned long flags;
 	bool wake_klogd = false;
+	bool first = true;
 	bool retry;
 
 	if (console_suspended) {
@@ -2261,6 +2262,11 @@ void console_unlock(void)
 		size_t len;
 		int level;
 
+		if (first)
+			first = false;
+		else
+			touch_all_softlockup_watchdogs();
+
 		raw_spin_lock_irqsave(&logbuf_lock, flags);
 		if (log->seen_seq != log->next_seq) {
 			wake_klogd = true;



More information about the Devel mailing list