// SPDX-License-Identifier: GPL-2.0
/*
 * Watchdog support on powerpc systems.
 *
 * Copyright 2017, IBM Corporation.
 *
 * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
 */

#define pr_fmt(fmt) "watchdog: " fmt

#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <linux/hardirq.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/sched/debug.h>
#include <linux/delay.h>
#include <linux/smp.h>

#include <asm/paca.h>

/*
 * The powerpc watchdog ensures that each CPU is able to service timers.
 * The watchdog sets up a simple timer on each CPU to run once per timer
 * period, and updates a per-cpu timestamp and a "pending" cpumask. This is
 * the heartbeat.
 *
 * Then there are two systems to check that the heartbeat is still running.
 * The local soft-NMI, and the SMP checker.
 *
 * The soft-NMI checker can detect lockups on the local CPU. When interrupts
 * are disabled with local_irq_disable(), platforms that use soft-masking
 * can leave hardware interrupts enabled and handle them with a masked
 * interrupt handler. The masked handler can send the timer interrupt to the
 * watchdog's soft_nmi_interrupt(), which appears to Linux as an NMI
 * interrupt, and can be used to detect CPUs stuck with IRQs disabled.
 *
 * The soft-NMI checker will compare the heartbeat timestamp for this CPU
 * with the current time, and take action if the difference exceeds the
 * watchdog threshold.
 *
 * The limitation of the soft-NMI watchdog is that it does not work when
 * interrupts are hard disabled or otherwise not being serviced. This is
 * solved by also having an SMP watchdog where all CPUs check all other
 * CPUs' heartbeats.
 *
 * The SMP checker can detect lockups on other CPUs. A global "pending"
 * cpumask is kept, containing all CPUs which enable the watchdog. Each
 * CPU clears its pending bit in its heartbeat timer. When the bitmask
 * becomes empty, the last CPU to clear its pending bit updates a global
 * timestamp and refills the pending bitmask.
 *
 * In the heartbeat timer, if any CPU notices that the global timestamp has
 * not been updated for a period exceeding the watchdog threshold, then it
 * means the CPU(s) with their bit still set in the pending mask have had
 * their heartbeats stop, and action is taken.
 *
 * Some platforms implement true NMI IPIs, which can be used by the SMP
 * watchdog to detect an unresponsive CPU and pull it out of its stuck
 * state with the NMI IPI, to get crash/debug data from it. This way the
 * SMP watchdog can detect lockups even when hardware interrupts are off.
 */

static cpumask_t wd_cpus_enabled __read_mostly;

static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */
static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */

static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeats */

static DEFINE_PER_CPU(struct timer_list, wd_timer);
static DEFINE_PER_CPU(u64, wd_timer_tb);

/* SMP checker bits */
static unsigned long __wd_smp_lock;
static cpumask_t wd_smp_cpus_pending;
static cpumask_t wd_smp_cpus_stuck;
static u64 wd_smp_last_reset_tb;

static inline void wd_smp_lock(unsigned long *flags)
{
	/*
	 * Avoid locking layers if possible.
	 * This may be called from low level interrupt handlers at some
	 * point in future.
	 */
	raw_local_irq_save(*flags);
	hard_irq_disable(); /* Make it soft-NMI safe */
	while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
		raw_local_irq_restore(*flags);
		spin_until_cond(!test_bit(0, &__wd_smp_lock));
		raw_local_irq_save(*flags);
		hard_irq_disable();
	}
}

static inline void wd_smp_unlock(unsigned long *flags)
{
	clear_bit_unlock(0, &__wd_smp_lock);
	raw_local_irq_restore(*flags);
}
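
/*
 * NMI IPI handler run on a CPU that the SMP checker has flagged as stuck:
 * dump that CPU's registers and stack so debug data reaches the console.
 */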
static void wd_lockup_ipi(struct pt_regs *regs)
{
	pr_emerg("CPU %d Hard LOCKUP\n", raw_smp_processor_id());
	print_modules();
	print_irqtrace_events(current);
	if (regs)
		show_regs(regs);
	else
		dump_stack();

	/* Do not panic from here because that can recurse into NMI IPI layer */
}
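
/*
 * Move the given CPUs from the pending mask to the stuck mask. If that
 * empties the pending mask, start a new check period covering the
 * remaining enabled, non-stuck CPUs. Called with wd_smp_lock held.
 */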
static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
{
	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
	if (cpumask_empty(&wd_smp_cpus_pending)) {
		wd_smp_last_reset_tb = tb;
		cpumask_andnot(&wd_smp_cpus_pending,
			       &wd_cpus_enabled,
			       &wd_smp_cpus_stuck);
	}
}

static void set_cpu_stuck(int cpu, u64 tb)
{
	set_cpumask_stuck(cpumask_of(cpu), tb);
}
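
/*
 * SMP checker action: the global timestamp has gone stale, so the CPUs
 * still in the pending mask have stopped heartbeating. Report them, try to
 * pull backtraces out of them via NMI IPI, take them out of the watch
 * group, and panic if hardlockup_panic is set.
 */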
static void watchdog_smp_panic(int cpu, u64 tb)
{
	unsigned long flags;
	int c;

	wd_smp_lock(&flags);
	/* Double check some things under lock */
	if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
		goto out;
	if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
		goto out;
	if (cpumask_weight(&wd_smp_cpus_pending) == 0)
		goto out;

	pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
		 cpu, cpumask_pr_args(&wd_smp_cpus_pending));

	if (!sysctl_hardlockup_all_cpu_backtrace) {
		/*
		 * Try to trigger the stuck CPUs, unless we are going to
		 * get a backtrace on all of them anyway.
		 */
		for_each_cpu(c, &wd_smp_cpus_pending) {
			if (c == cpu)
				continue;
			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
		}
		smp_flush_nmi_ipi(1000000);
	}

	/* Take the stuck CPUs out of the watch group */
	set_cpumask_stuck(&wd_smp_cpus_pending, tb);

	wd_smp_unlock(&flags);

	printk_safe_flush();
	/*
	 * printk_safe_flush() seems to require another print
	 * before anything actually goes out to console.
	 */
	if (sysctl_hardlockup_all_cpu_backtrace)
		trigger_allbutself_cpu_backtrace();

	if (hardlockup_panic)
		nmi_panic(NULL, "Hard LOCKUP");

	return;

out:
	wd_smp_unlock(&flags);
}
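
/*
 * Heartbeat path: clear this CPU's bit in the pending mask. The last CPU
 * to clear its bit resets the global timestamp and refills the mask. A CPU
 * that was marked stuck but is running again is reported and unstuck.
 */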
static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
{
	if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
		if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
			unsigned long flags;

			pr_emerg("CPU %d became unstuck\n", cpu);
			wd_smp_lock(&flags);
			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
			wd_smp_unlock(&flags);
		}
		return;
	}
	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
	if (cpumask_empty(&wd_smp_cpus_pending)) {
		unsigned long flags;

		wd_smp_lock(&flags);
		if (cpumask_empty(&wd_smp_cpus_pending)) {
			wd_smp_last_reset_tb = tb;
			cpumask_andnot(&wd_smp_cpus_pending,
				       &wd_cpus_enabled,
				       &wd_smp_cpus_stuck);
		}
		wd_smp_unlock(&flags);
	}
}
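
/*
 * Per-CPU heartbeat: refresh this CPU's timestamp, clear its pending bit,
 * and run the SMP check if the global timestamp looks stale.
 */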
static void watchdog_timer_interrupt(int cpu)
{
	u64 tb = get_tb();

	per_cpu(wd_timer_tb, cpu) = tb;

	wd_smp_clear_cpu_pending(cpu, tb);

	if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
		watchdog_smp_panic(cpu, tb);
}
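
/*
 * Soft-NMI entry, reached from the masked decrementer handler: detects
 * this CPU stuck with interrupts soft-disabled, reports the lockup, and
 * panics if hardlockup_panic is set. Re-arms the decrementer on the way
 * out so the check keeps firing.
 */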
void soft_nmi_interrupt(struct pt_regs *regs)
{
	unsigned long flags;
	int cpu = raw_smp_processor_id();
	u64 tb;

	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
		return;

	nmi_enter();

	__this_cpu_inc(irq_stat.soft_nmi_irqs);

	tb = get_tb();
	if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
		per_cpu(wd_timer_tb, cpu) = tb;

		wd_smp_lock(&flags);
		if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
			wd_smp_unlock(&flags);
			goto out;
		}
		set_cpu_stuck(cpu, tb);

		pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n", cpu, (void *)regs->nip);
		print_modules();
		print_irqtrace_events(current);
		show_regs(regs);

		wd_smp_unlock(&flags);

		if (sysctl_hardlockup_all_cpu_backtrace)
			trigger_allbutself_cpu_backtrace();

		if (hardlockup_panic)
			nmi_panic(regs, "Hard LOCKUP");
	}
	if (wd_panic_timeout_tb < 0x7fffffff)
		mtspr(SPRN_DEC, wd_panic_timeout_tb);

out:
	nmi_exit();
}
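
/* Heartbeat timer: wd_timer_fn runs the heartbeat, then re-arms itself. */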
static void wd_timer_reset(unsigned int cpu, struct timer_list *t)
{
	t->expires = jiffies + msecs_to_jiffies(wd_timer_period_ms);
	if (wd_timer_period_ms > 1000)
		t->expires = __round_jiffies_up(t->expires, cpu);
	add_timer_on(t, cpu);
}

static void wd_timer_fn(struct timer_list *t)
{
	int cpu = smp_processor_id();

	watchdog_timer_interrupt(cpu);

	wd_timer_reset(cpu, t);
}
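
/*
 * Called via touch_nmi_watchdog() by code that legitimately runs with
 * interrupts disabled for a long time: refresh this CPU's heartbeat if at
 * least one timer period has passed, so neither checker fires.
 */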
void arch_touch_nmi_watchdog(void)
{
	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
	int cpu = smp_processor_id();
	u64 tb = get_tb();

	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
		per_cpu(wd_timer_tb, cpu) = tb;
		wd_smp_clear_cpu_pending(cpu, tb);
	}
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
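
/* Set up and arm, or tear down, the per-CPU heartbeat timer. */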
static void start_watchdog_timer_on(unsigned int cpu)
{
	struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);

	per_cpu(wd_timer_tb, cpu) = get_tb();

	timer_setup(t, wd_timer_fn, TIMER_PINNED);
	wd_timer_reset(cpu, t);
}

static void stop_watchdog_timer_on(unsigned int cpu)
{
	struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);

	del_timer_sync(t);
}
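
/*
 * CPU hotplug callbacks: enable or disable the watchdog as a CPU comes
 * online or goes offline. The first CPU to be enabled also starts a new
 * SMP check period.
 */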
static int start_wd_on_cpu(unsigned int cpu)
{
	unsigned long flags;

	if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
		WARN_ON(1);
		return 0;
	}

	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
		return 0;

	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
		return 0;

	wd_smp_lock(&flags);
	cpumask_set_cpu(cpu, &wd_cpus_enabled);
	if (cpumask_weight(&wd_cpus_enabled) == 1) {
		cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
		wd_smp_last_reset_tb = get_tb();
	}
	wd_smp_unlock(&flags);

	start_watchdog_timer_on(cpu);

	return 0;
}

static int stop_wd_on_cpu(unsigned int cpu)
{
	unsigned long flags;

	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
		return 0; /* Can happen in CPU unplug case */

	stop_watchdog_timer_on(cpu);

	wd_smp_lock(&flags);
	cpumask_clear_cpu(cpu, &wd_cpus_enabled);
	wd_smp_unlock(&flags);

	wd_smp_clear_cpu_pending(cpu, get_tb());

	return 0;
}

static void watchdog_calc_timeouts(void)
{
	wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;

	/* Have the SMP detector trigger a bit later */
	wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;

	/* 2/5 is the factor that the perf based detector uses */
	wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
}
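
/*
 * Invoked by the core lockup detector when it is reconfigured: stop the
 * watchdog on all enabled CPUs, then restart it with freshly calculated
 * timeouts on the online CPUs selected by watchdog_cpumask.
 */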
void watchdog_nmi_stop(void)
{
	int cpu;

	for_each_cpu(cpu, &wd_cpus_enabled)
		stop_wd_on_cpu(cpu);
}

void watchdog_nmi_start(void)
{
	int cpu;

	watchdog_calc_timeouts();
	for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
		start_wd_on_cpu(cpu);
}

/*
 * Invoked from core watchdog init.
 */
int __init watchdog_nmi_probe(void)
{
	int err;

	err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"powerpc/watchdog:online",
					start_wd_on_cpu, stop_wd_on_cpu);
	if (err < 0) {
		pr_warn("could not be initialized\n");
		return err;
	}
	return 0;
}