Browse Source

Blackfin: implement nmi_watchdog for SMP on BF561

Signed-off-by: Graf Yang <graf.yang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Graf Yang 15 years ago
parent
commit
60ffdb3654

+ 9 - 0
arch/blackfin/Kconfig.debug

@@ -238,6 +238,15 @@ config EARLY_PRINTK
 	  all of this lives in the init section and is thrown away after the
 	  all of this lives in the init section and is thrown away after the
 	  kernel boots completely.
 	  kernel boots completely.
 
 
+config NMI_WATCHDOG
+	bool "Enable NMI watchdog to help debugging lockup on SMP"
+	default n
+	depends on (SMP && !BFIN_SCRATCH_REG_RETN)
+	help
+	  If any CPU in the system does not execute the periodic local timer
+	  interrupt for more than 5 seconds, then the NMI handler dumps debug
+	  information. This information can be used to debug the lockup.
+
 config CPLB_INFO
 config CPLB_INFO
 	bool "Display the CPLB information"
 	bool "Display the CPLB information"
 	help
 	help

+ 4 - 0
arch/blackfin/include/asm/irq.h

@@ -38,4 +38,8 @@
 
 
 #include <asm-generic/irq.h>
 #include <asm-generic/irq.h>
 
 
+#ifdef CONFIG_NMI_WATCHDOG
+# define ARCH_HAS_NMI_WATCHDOG
+#endif
+
 #endif				/* _BFIN_IRQ_H_ */
 #endif				/* _BFIN_IRQ_H_ */

+ 12 - 0
arch/blackfin/include/asm/nmi.h

@@ -0,0 +1,12 @@
+/*
+ * Copyright 2010 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2
+ */
+
+#ifndef _BFIN_NMI_H_
+#define _BFIN_NMI_H_
+
+#include <linux/nmi.h>
+
+#endif

+ 1 - 0
arch/blackfin/include/asm/smp.h

@@ -22,6 +22,7 @@ extern char coreb_trampoline_start, coreb_trampoline_end;
 struct corelock_slot {
 struct corelock_slot {
 	int lock;
 	int lock;
 };
 };
+extern struct corelock_slot corelock;
 
 
 void smp_icache_flush_range_others(unsigned long start,
 void smp_icache_flush_range_others(unsigned long start,
 				   unsigned long end);
 				   unsigned long end);

+ 1 - 0
arch/blackfin/kernel/Makefile

@@ -25,6 +25,7 @@ obj-$(CONFIG_CPLB_INFO)              += cplbinfo.o
 obj-$(CONFIG_MODULES)                += module.o
 obj-$(CONFIG_MODULES)                += module.o
 obj-$(CONFIG_KGDB)                   += kgdb.o
 obj-$(CONFIG_KGDB)                   += kgdb.o
 obj-$(CONFIG_KGDB_TESTS)             += kgdb_test.o
 obj-$(CONFIG_KGDB_TESTS)             += kgdb_test.o
+obj-$(CONFIG_NMI_WATCHDOG)           += nmi.o
 obj-$(CONFIG_EARLY_PRINTK)           += early_printk.o
 obj-$(CONFIG_EARLY_PRINTK)           += early_printk.o
 obj-$(CONFIG_EARLY_PRINTK)           += shadow_console.o
 obj-$(CONFIG_EARLY_PRINTK)           += shadow_console.o
 obj-$(CONFIG_STACKTRACE)             += stacktrace.o
 obj-$(CONFIG_STACKTRACE)             += stacktrace.o

+ 313 - 0
arch/blackfin/kernel/nmi.c

@@ -0,0 +1,313 @@
+/*
+ * Blackfin nmi_watchdog Driver
+ *
+ * Originally based on bfin_wdt.c
+ * Copyright 2010-2010 Analog Devices Inc.
+ *		Graf Yang <graf.yang@analog.com>
+ *
+ * Enter bugs at http://blackfin.uclinux.org/
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/bitops.h>
+#include <linux/hardirq.h>
+#include <linux/sysdev.h>
+#include <linux/pm.h>
+#include <linux/nmi.h>
+#include <linux/smp.h>
+#include <linux/timer.h>
+#include <asm/blackfin.h>
+#include <asm/atomic.h>
+#include <asm/cacheflush.h>
+
+/* Bit in WDOG_CTL that indicates watchdog has expired (WDR0) */
+#define WDOG_EXPIRED 0x8000
+
+/* Masks for WDEV field in WDOG_CTL register */
+#define ICTL_RESET   0x0
+#define ICTL_NMI     0x2
+#define ICTL_GPI     0x4
+#define ICTL_NONE    0x6
+#define ICTL_MASK    0x6
+
+/* Masks for WDEN field in WDOG_CTL register */
+#define WDEN_MASK    0x0FF0
+#define WDEN_ENABLE  0x0000
+#define WDEN_DISABLE 0x0AD0
+
+#define DRV_NAME "nmi-wdt"
+
+#define NMI_WDT_TIMEOUT 5          /* 5 seconds */
+#define NMI_CHECK_TIMEOUT (4 * HZ) /* 4 seconds in jiffies */
+static int nmi_wdt_cpu = 1;
+
+static unsigned int timeout = NMI_WDT_TIMEOUT;
+static int nmi_active;
+
+static unsigned short wdoga_ctl;
+static unsigned int wdoga_cnt;
+static struct corelock_slot saved_corelock;
+static atomic_t nmi_touched[NR_CPUS];
+static struct timer_list ntimer;
+
+enum {
+	COREA_ENTER_NMI = 0,
+	COREA_EXIT_NMI,
+	COREB_EXIT_NMI,
+
+	NMI_EVENT_NR,
+};
+static unsigned long nmi_event __attribute__ ((__section__(".l2.bss")));
+
+/* we are in nmi, so non-atomic bit ops are safe */
+static inline void set_nmi_event(int event)
+{
+	__set_bit(event, &nmi_event);
+}
+
+static inline void wait_nmi_event(int event)
+{
+	while (!test_bit(event, &nmi_event))
+		barrier();
+	__clear_bit(event, &nmi_event);
+}
+
+static inline void send_corea_nmi(void)
+{
+	wdoga_ctl = bfin_read_WDOGA_CTL();
+	wdoga_cnt = bfin_read_WDOGA_CNT();
+
+	bfin_write_WDOGA_CTL(WDEN_DISABLE);
+	bfin_write_WDOGA_CNT(0);
+	bfin_write_WDOGA_CTL(WDEN_ENABLE | ICTL_NMI);
+}
+
+static inline void restore_corea_nmi(void)
+{
+	bfin_write_WDOGA_CTL(WDEN_DISABLE);
+	bfin_write_WDOGA_CTL(WDOG_EXPIRED | WDEN_DISABLE | ICTL_NONE);
+
+	bfin_write_WDOGA_CNT(wdoga_cnt);
+	bfin_write_WDOGA_CTL(wdoga_ctl);
+}
+
+static inline void save_corelock(void)
+{
+	saved_corelock = corelock;
+	corelock.lock = 0;
+}
+
+static inline void restore_corelock(void)
+{
+	corelock = saved_corelock;
+}
+
+
+static inline void nmi_wdt_keepalive(void)
+{
+	bfin_write_WDOGB_STAT(0);
+}
+
+static inline void nmi_wdt_stop(void)
+{
+	bfin_write_WDOGB_CTL(WDEN_DISABLE);
+}
+
+/* before calling this function, you must stop the WDT */
+static inline void nmi_wdt_clear(void)
+{
+	/* clear TRO bit, disable event generation */
+	bfin_write_WDOGB_CTL(WDOG_EXPIRED | WDEN_DISABLE | ICTL_NONE);
+}
+
+static inline void nmi_wdt_start(void)
+{
+	bfin_write_WDOGB_CTL(WDEN_ENABLE | ICTL_NMI);
+}
+
+static inline int nmi_wdt_running(void)
+{
+	return ((bfin_read_WDOGB_CTL() & WDEN_MASK) != WDEN_DISABLE);
+}
+
+static inline int nmi_wdt_set_timeout(unsigned long t)
+{
+	u32 cnt, max_t, sclk;
+	int run;
+
+	sclk = get_sclk();
+	max_t = -1 / sclk;
+	cnt = t * sclk;
+	if (t > max_t) {
+		pr_warning("NMI: timeout value is too large\n");
+		return -EINVAL;
+	}
+
+	run = nmi_wdt_running();
+	nmi_wdt_stop();
+	bfin_write_WDOGB_CNT(cnt);
+	if (run)
+		nmi_wdt_start();
+
+	timeout = t;
+
+	return 0;
+}
+
+int check_nmi_wdt_touched(void)
+{
+	unsigned int this_cpu = smp_processor_id();
+	unsigned int cpu;
+
+	cpumask_t mask = cpu_online_map;
+
+	if (!atomic_read(&nmi_touched[this_cpu]))
+		return 0; /* this CPU's flag has not been set since it was last cleared */
+
+	atomic_set(&nmi_touched[this_cpu], 0);
+
+	cpu_clear(this_cpu, mask); /* only the other online CPUs remain in mask */
+	for_each_cpu_mask(cpu, mask) {
+		invalidate_dcache_range((unsigned long)(&nmi_touched[cpu]),
+				(unsigned long)(&nmi_touched[cpu])); /* NOTE(review): start == end, i.e. an empty range - confirm the helper still invalidates this line */
+		if (!atomic_read(&nmi_touched[cpu]))
+			return 0; /* flags already cleared above stay cleared */
+		atomic_set(&nmi_touched[cpu], 0);
+	}
+
+	return 1; /* every online CPU had its flag set */
+}
+
+static void nmi_wdt_timer(unsigned long data)
+{
+	if (check_nmi_wdt_touched())
+		nmi_wdt_keepalive();
+
+	mod_timer(&ntimer, jiffies + NMI_CHECK_TIMEOUT);
+}
+
+static int __init init_nmi_wdt(void)
+{
+	nmi_wdt_set_timeout(timeout);
+	nmi_wdt_start();
+	nmi_active = true;
+
+	init_timer(&ntimer);
+	ntimer.function = nmi_wdt_timer;
+	ntimer.expires = jiffies + NMI_CHECK_TIMEOUT;
+	add_timer(&ntimer);
+
+	pr_info("nmi_wdt: initialized: timeout=%d sec\n", timeout);
+	return 0;
+}
+device_initcall(init_nmi_wdt);
+
+void touch_nmi_watchdog(void)
+{
+	atomic_set(&nmi_touched[smp_processor_id()], 1);
+}
+
+/* Suspend/resume support */
+#ifdef CONFIG_PM
+static int nmi_wdt_suspend(struct sys_device *dev, pm_message_t state)
+{
+	nmi_wdt_stop();
+	return 0;
+}
+
+static int nmi_wdt_resume(struct sys_device *dev)
+{
+	if (nmi_active)
+		nmi_wdt_start();
+	return 0;
+}
+
+static struct sysdev_class nmi_sysclass = {
+	.name		= DRV_NAME,
+	.resume		= nmi_wdt_resume,
+	.suspend	= nmi_wdt_suspend,
+};
+
+static struct sys_device device_nmi_wdt = {
+	.id	= 0,
+	.cls	= &nmi_sysclass,
+};
+
+static int __init init_nmi_wdt_sysfs(void)
+{
+	int error;
+
+	if (!nmi_active)
+		return 0;
+
+	error = sysdev_class_register(&nmi_sysclass);
+	if (!error)
+		error = sysdev_register(&device_nmi_wdt);
+	return error;
+}
+late_initcall(init_nmi_wdt_sysfs);
+
+#endif	/* CONFIG_PM */
+
+
+asmlinkage notrace void do_nmi(struct pt_regs *fp)
+{
+	unsigned int cpu = smp_processor_id();
+	nmi_enter();
+
+	cpu_pda[cpu].__nmi_count += 1;
+
+	if (cpu == nmi_wdt_cpu) {
+		/* CoreB goes here first */
+
+		/* reload the WDOG_STAT */
+		nmi_wdt_keepalive();
+
+		/* clear nmi interrupt for CoreB */
+		nmi_wdt_stop();
+		nmi_wdt_clear();
+
+		/* trigger NMI interrupt of CoreA */
+		send_corea_nmi();
+
+		/* wait for CoreA to enter NMI */
+		wait_nmi_event(COREA_ENTER_NMI);
+
+		/* recover WDOGA's settings */
+		restore_corea_nmi();
+
+		save_corelock();
+
+		/* corelock is saved/cleared, CoreA is dumping messages */
+
+		wait_nmi_event(COREA_EXIT_NMI);
+	} else {
+		/* OK, CoreA entered NMI */
+		set_nmi_event(COREA_ENTER_NMI);
+	}
+
+	pr_emerg("\nNMI Watchdog detected LOCKUP, dump for CPU %d\n", cpu);
+	dump_bfin_process(fp);
+	dump_bfin_mem(fp);
+	show_regs(fp);
+	dump_bfin_trace_buffer();
+	show_stack(current, (unsigned long *)fp);
+
+	if (cpu == nmi_wdt_cpu) {
+		pr_emerg("This fault is not recoverable, sorry!\n");
+
+		/* both dumps are finished, restore the corelock */
+		restore_corelock();
+
+		set_nmi_event(COREB_EXIT_NMI);
+	} else {
+		/* CoreA dump finished, notify CoreB that we are done */
+		set_nmi_event(COREA_EXIT_NMI);
+
+		/* synchronize with CoreB */
+		wait_nmi_event(COREB_EXIT_NMI);
+	}
+
+	nmi_exit();
+}

+ 4 - 0
arch/blackfin/kernel/time-ts.c

@@ -21,6 +21,7 @@
 #include <asm/blackfin.h>
 #include <asm/blackfin.h>
 #include <asm/time.h>
 #include <asm/time.h>
 #include <asm/gptimers.h>
 #include <asm/gptimers.h>
+#include <asm/nmi.h>
 
 
 /* Accelerators for sched_clock()
 /* Accelerators for sched_clock()
  * convert from cycles(64bits) => nanoseconds (64bits)
  * convert from cycles(64bits) => nanoseconds (64bits)
@@ -309,6 +310,9 @@ irqreturn_t bfin_coretmr_interrupt(int irq, void *dev_id)
 
 
 	smp_mb();
 	smp_mb();
 	evt->event_handler(evt);
 	evt->event_handler(evt);
+
+	touch_nmi_watchdog();
+
 	return IRQ_HANDLED;
 	return IRQ_HANDLED;
 }
 }
 
 

+ 17 - 1
arch/blackfin/mach-common/interrupt.S

@@ -194,12 +194,28 @@ ENTRY(_evt_ivhw)
 ENDPROC(_evt_ivhw)
 ENDPROC(_evt_ivhw)
 
 
 /* Interrupt routine for evt2 (NMI).
 /* Interrupt routine for evt2 (NMI).
- * We don't actually use this, so just return.
  * For inner circle type details, please see:
  * For inner circle type details, please see:
  * http://docs.blackfin.uclinux.org/doku.php?id=linux-kernel:nmi
  * http://docs.blackfin.uclinux.org/doku.php?id=linux-kernel:nmi
  */
  */
 ENTRY(_evt_nmi)
 ENTRY(_evt_nmi)
+#ifndef CONFIG_NMI_WATCHDOG
 .weak _evt_nmi
 .weak _evt_nmi
+#else
+	/* Does not take CPLBs into account; this handler will not return */
+	SAVE_ALL_SYS
+	r0 = sp;
+	r1 = retn;
+	[sp + PT_PC] = r1;
+	trace_buffer_save(p4,r5);
+
+	ANOMALY_283_315_WORKAROUND(p4, r5)
+
+	SP += -12;
+	call _do_nmi;
+	SP += 12;
+1:
+	jump 1b;
+#endif
 	rtn;
 	rtn;
 ENDPROC(_evt_nmi)
 ENDPROC(_evt_nmi)