[Devel] [PATCH RH7 v2] sysctl: panic only after softlockup_panic seconds of softlockups in a row
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Thu Sep 29 00:47:49 PDT 2016
sysctl kernel.softlockup_panic (integer >= 0)
0(default) - do not generate panics on softlockup
n > 0 - panic if cpu is in softlockup state for a period
of n seconds
For instance: if n == 1, the sysctl works the same as the mainstream one:
the kernel panics immediately on the first softlockup, because a CPU must
be locked for at least (2*kernel.watchdog_thresh) seconds,
where kernel.watchdog_thresh >= 1, before a softlockup is reported.
If kernel.watchdog_thresh is 10 (default), real softlockups will
come every 20-22 seconds, as sample_period is (kernel.watchdog_thresh
/ 5.0) == 2.0. Assume softlockup_panic == 60: there can be a situation
where the first softlockup is printed at time 0s, the second at time 22s,
the third at 44s, the fourth comes at 66s, and finally, as 66 > 60, the kernel
panics.
Actually, softlockups are not required to be strictly consecutive (the processor
can be unlocked during the period), as we allow 4 * kernel.watchdog_thresh
between the beginning of a softlockup and the beginning of the next consecutive one.
This means that the panic can really happen in the interval between
kernel.softlockup_panic and (kernel.softlockup_panic +
4 * kernel.watchdog_thresh + sample_period). So, for the example
in question, between 60 and 102 seconds. A bigger watchdog_thresh means
a bigger error. To get the locked time, grep dmesg for: "BUG: continious
softlockup for".
https://jira.sw.ru/browse/PSBM-52199
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
kernel/sysctl.c | 3 ++-
kernel/watchdog.c | 27 ++++++++++++++++++++++++++-
2 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8f7bc3..40dfc98 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -123,6 +123,7 @@ EXPORT_SYMBOL(ve_allow_module_load);
#ifdef CONFIG_LOCKUP_DETECTOR
static int sixty = 60;
static int neg_one = -1;
+static int int_max = INT_MAX;
#endif
static int zero;
@@ -910,7 +911,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
- .extra2 = &one,
+ .extra2 = &int_max,
},
#ifdef CONFIG_SMP
{
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index ba61141..9addd5b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -54,6 +54,8 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
static unsigned long soft_lockup_nmi_warn;
+static DEFINE_PER_CPU(unsigned long, first_softlockup_ts);
+static DEFINE_PER_CPU(unsigned long, prev_softlockup_ts);
/* boot commands */
/*
@@ -304,6 +306,23 @@ static void watchdog_interrupt_count(void)
static int watchdog_nmi_enable(unsigned int cpu);
static void watchdog_nmi_disable(unsigned int cpu);
+static int need_panic_on_softlockup(void)
+{
+ unsigned long first_ts = __this_cpu_read(first_softlockup_ts);
+ unsigned long now = get_timestamp();
+ int slp = softlockup_panic;
+
+ if (!slp)
+ return 0;
+
+ if (time_after(now, first_ts + slp)) {
+ printk(KERN_EMERG "BUG: continious softlockup for %lus!\n",
+ now - first_ts);
+ return 1;
+ }
+ return 0;
+}
+
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
@@ -345,6 +364,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
*/
duration = is_softlockup(touch_ts);
if (unlikely(duration)) {
+ unsigned long prev_ts = __this_cpu_read(prev_softlockup_ts);
+
+ if (!prev_ts || time_after(touch_ts, prev_ts + 2 * get_softlockup_thresh()))
+ __this_cpu_write(first_softlockup_ts, touch_ts);
+ __this_cpu_write(prev_softlockup_ts, touch_ts);
+
/*
* If a virtual machine is stopped by the host it can look to
* the watchdog like a soft lockup, check to see if the host
@@ -390,7 +415,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
}
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
- if (softlockup_panic)
+ if (need_panic_on_softlockup())
panic("softlockup: hung tasks");
__this_cpu_write(soft_watchdog_warn, true);
} else
--
2.5.5
More information about the Devel
mailing list