watchdog.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Watchdog support on powerpc systems.
 *
 * Copyright 2017, IBM Corporation.
 *
 * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
 */

#define pr_fmt(fmt) "watchdog: " fmt

#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <linux/hardirq.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/sched/debug.h>
#include <linux/delay.h>
#include <linux/smp.h>

#include <asm/paca.h>

/*
 * The powerpc watchdog ensures that each CPU is able to service timers.
 * The watchdog sets up a simple timer on each CPU to run once per timer
 * period, and updates a per-cpu timestamp and a "pending" cpumask. This is
 * the heartbeat.
 *
 * Then there are two systems to check that the heartbeat is still running.
 * The local soft-NMI, and the SMP checker.
 *
 * The soft-NMI checker can detect lockups on the local CPU. When interrupts
 * are disabled with local_irq_disable(), platforms that use soft-masking
 * can leave hardware interrupts enabled and handle them with a masked
 * interrupt handler. The masked handler can send the timer interrupt to the
 * watchdog's soft_nmi_interrupt(), which appears to Linux as an NMI
 * interrupt, and can be used to detect CPUs stuck with IRQs disabled.
 *
 * The soft-NMI checker will compare the heartbeat timestamp for this CPU
 * with the current time, and take action if the difference exceeds the
 * watchdog threshold.
 *
 * The limitation of the soft-NMI watchdog is that it does not work when
 * interrupts are hard disabled or otherwise not being serviced. This is
 * solved by also having an SMP watchdog where all CPUs check all other
 * CPUs' heartbeats.
 *
 * The SMP checker can detect lockups on other CPUs. A global "pending"
 * cpumask is kept, containing all CPUs which enable the watchdog. Each
 * CPU clears its pending bit in its heartbeat timer. When the bitmask
 * becomes empty, the last CPU to clear its pending bit updates a global
 * timestamp and refills the pending bitmask.
 *
 * In the heartbeat timer, if any CPU notices that the global timestamp has
 * not been updated for a period exceeding the watchdog threshold, then it
 * means the CPU(s) with their bit still set in the pending mask have had
 * their heartbeat stop, and action is taken.
 *
 * Some platforms implement true NMI IPIs, which can be used by the SMP
 * watchdog to detect an unresponsive CPU and pull it out of its stuck
 * state with the NMI IPI, to get crash/debug data from it. This way the
 * SMP watchdog can detect hardware interrupts off lockups.
 */

static cpumask_t wd_cpus_enabled __read_mostly;

static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */
static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */
static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeats */

static DEFINE_PER_CPU(struct timer_list, wd_timer);
static DEFINE_PER_CPU(u64, wd_timer_tb);

/* SMP checker bits */
static unsigned long __wd_smp_lock;
static cpumask_t wd_smp_cpus_pending;
static cpumask_t wd_smp_cpus_stuck;
static u64 wd_smp_last_reset_tb;
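
/*
 * Lock protecting the SMP checker state above: the pending and stuck
 * cpumasks and wd_smp_last_reset_tb. Taken with a raw test_and_set_bit
 * loop so it is usable from soft-NMI context.
 */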
static inline void wd_smp_lock(unsigned long *flags)
{
        /*
         * Avoid locking layers if possible.
         * This may be called from low level interrupt handlers at some
         * point in future.
         */
        raw_local_irq_save(*flags);
        hard_irq_disable(); /* Make it soft-NMI safe */
        while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
                raw_local_irq_restore(*flags);
                spin_until_cond(!test_bit(0, &__wd_smp_lock));
                raw_local_irq_save(*flags);
                hard_irq_disable();
        }
}

static inline void wd_smp_unlock(unsigned long *flags)
{
        clear_bit_unlock(0, &__wd_smp_lock);
        raw_local_irq_restore(*flags);
}
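
/*
 * NMI IPI handler sent by the SMP checker to a CPU it believes is stuck.
 * Runs on the stuck CPU and dumps its state to the console.
 */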
static void wd_lockup_ipi(struct pt_regs *regs)
{
        int cpu = raw_smp_processor_id();
        u64 tb = get_tb();

        pr_emerg("CPU %d Hard LOCKUP\n", cpu);
        pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
                 cpu, tb, per_cpu(wd_timer_tb, cpu),
                 tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
        print_modules();
        print_irqtrace_events(current);
        if (regs)
                show_regs(regs);
        else
                dump_stack();

        /* Do not panic from here because that can recurse into NMI IPI layer */
}
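
/*
 * Mark the CPUs in @cpumask as stuck and drop them from the pending mask.
 * If that empties the pending mask, start a new check period: reset the
 * global timestamp and refill pending with all enabled, non-stuck CPUs.
 * Must be called with the SMP checker lock held.
 */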
static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
{
        cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
        cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
        if (cpumask_empty(&wd_smp_cpus_pending)) {
                wd_smp_last_reset_tb = tb;
                cpumask_andnot(&wd_smp_cpus_pending,
                               &wd_cpus_enabled,
                               &wd_smp_cpus_stuck);
        }
}

static void set_cpu_stuck(int cpu, u64 tb)
{
        set_cpumask_stuck(cpumask_of(cpu), tb);
}
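
/*
 * Called from the heartbeat timer when the global SMP heartbeat timestamp
 * has not been updated for longer than the SMP panic timeout. Reports the
 * CPUs that are still pending, tries to pull them out with NMI IPIs to get
 * their state, takes them out of the watch group, and optionally panics.
 */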
static void watchdog_smp_panic(int cpu, u64 tb)
{
        unsigned long flags;
        int c;

        wd_smp_lock(&flags);
        /* Double check some things under lock */
        if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
                goto out;
        if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
                goto out;
        if (cpumask_weight(&wd_smp_cpus_pending) == 0)
                goto out;

        pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
                 cpu, cpumask_pr_args(&wd_smp_cpus_pending));
        pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
                 cpu, tb, wd_smp_last_reset_tb,
                 tb_to_ns(tb - wd_smp_last_reset_tb) / 1000000);

        if (!sysctl_hardlockup_all_cpu_backtrace) {
                /*
                 * Try to trigger the stuck CPUs, unless we are going to
                 * get a backtrace on all of them anyway.
                 */
                for_each_cpu(c, &wd_smp_cpus_pending) {
                        if (c == cpu)
                                continue;
                        smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
                }
                smp_flush_nmi_ipi(1000000);
        }

        /* Take the stuck CPUs out of the watch group */
        set_cpumask_stuck(&wd_smp_cpus_pending, tb);

        wd_smp_unlock(&flags);

        printk_safe_flush();
        /*
         * printk_safe_flush() seems to require another print
         * before anything actually goes out to console.
         */
        if (sysctl_hardlockup_all_cpu_backtrace)
                trigger_allbutself_cpu_backtrace();

        if (hardlockup_panic)
                nmi_panic(NULL, "Hard LOCKUP");

        return;

out:
        wd_smp_unlock(&flags);
}
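
/*
 * Clear this CPU's bit in the pending mask. The last CPU to clear its bit
 * updates the global timestamp and refills the mask for the next check
 * period. A CPU that was previously marked stuck is reported as having
 * become unstuck.
 */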
static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
{
        if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
                if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
                        struct pt_regs *regs = get_irq_regs();
                        unsigned long flags;

                        wd_smp_lock(&flags);

                        pr_emerg("CPU %d became unstuck TB:%lld\n",
                                 cpu, tb);
                        print_irqtrace_events(current);
                        if (regs)
                                show_regs(regs);
                        else
                                dump_stack();

                        cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
                        wd_smp_unlock(&flags);
                }
                return;
        }
        cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
        if (cpumask_empty(&wd_smp_cpus_pending)) {
                unsigned long flags;

                wd_smp_lock(&flags);
                if (cpumask_empty(&wd_smp_cpus_pending)) {
                        wd_smp_last_reset_tb = tb;
                        cpumask_andnot(&wd_smp_cpus_pending,
                                       &wd_cpus_enabled,
                                       &wd_smp_cpus_stuck);
                }
                wd_smp_unlock(&flags);
        }
}
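
/*
 * Per-CPU heartbeat: record the local timestamp, clear our bit in the
 * pending mask, and check whether the global SMP heartbeat has gone stale.
 */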
static void watchdog_timer_interrupt(int cpu)
{
        u64 tb = get_tb();

        per_cpu(wd_timer_tb, cpu) = tb;

        wd_smp_clear_cpu_pending(cpu, tb);

        if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
                watchdog_smp_panic(cpu, tb);
}
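
/*
 * Soft-NMI entry: called from the masked interrupt handler when a timer
 * interrupt arrives while this CPU has interrupts soft-disabled. If the
 * local heartbeat is older than the panic timeout, report a self-detected
 * hard lockup.
 */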
void soft_nmi_interrupt(struct pt_regs *regs)
{
        unsigned long flags;
        int cpu = raw_smp_processor_id();
        u64 tb;

        if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
                return;

        nmi_enter();

        __this_cpu_inc(irq_stat.soft_nmi_irqs);

        tb = get_tb();
        if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
                wd_smp_lock(&flags);
                if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
                        wd_smp_unlock(&flags);
                        goto out;
                }
                set_cpu_stuck(cpu, tb);

                pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
                         cpu, (void *)regs->nip);
                pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
                         cpu, tb, per_cpu(wd_timer_tb, cpu),
                         tb_to_ns(tb - per_cpu(wd_timer_tb, cpu)) / 1000000);
                print_modules();
                print_irqtrace_events(current);
                show_regs(regs);

                wd_smp_unlock(&flags);

                if (sysctl_hardlockup_all_cpu_backtrace)
                        trigger_allbutself_cpu_backtrace();

                if (hardlockup_panic)
                        nmi_panic(regs, "Hard LOCKUP");
        }
        if (wd_panic_timeout_tb < 0x7fffffff)
                mtspr(SPRN_DEC, wd_panic_timeout_tb);

out:
        nmi_exit();
}
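
/* Re-arm the per-CPU heartbeat timer for the next period, pinned to @cpu. */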
static void wd_timer_reset(unsigned int cpu, struct timer_list *t)
{
        t->expires = jiffies + msecs_to_jiffies(wd_timer_period_ms);
        if (wd_timer_period_ms > 1000)
                t->expires = __round_jiffies_up(t->expires, cpu);
        add_timer_on(t, cpu);
}

static void wd_timer_fn(struct timer_list *t)
{
        int cpu = smp_processor_id();

        watchdog_timer_interrupt(cpu);

        wd_timer_reset(cpu, t);
}
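
/*
 * touch_nmi_watchdog() hook: code that legitimately runs for a long time
 * with interrupts off calls this to refresh the local heartbeat so that
 * neither checker fires.
 */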
void arch_touch_nmi_watchdog(void)
{
        unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
        int cpu = smp_processor_id();
        u64 tb = get_tb();

        if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
                per_cpu(wd_timer_tb, cpu) = tb;
                wd_smp_clear_cpu_pending(cpu, tb);
        }
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
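
/* Arm and disarm the per-CPU heartbeat timer. */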
static void start_watchdog_timer_on(unsigned int cpu)
{
        struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);

        per_cpu(wd_timer_tb, cpu) = get_tb();

        timer_setup(t, wd_timer_fn, TIMER_PINNED);
        wd_timer_reset(cpu, t);
}

static void stop_watchdog_timer_on(unsigned int cpu)
{
        struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);

        del_timer_sync(t);
}
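
/*
 * CPU hotplug online/offline callbacks: add or remove @cpu from the watch
 * group. The first CPU to be added also starts the SMP check period.
 */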
static int start_wd_on_cpu(unsigned int cpu)
{
        unsigned long flags;

        if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
                WARN_ON(1);
                return 0;
        }

        if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
                return 0;

        if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
                return 0;

        wd_smp_lock(&flags);
        cpumask_set_cpu(cpu, &wd_cpus_enabled);
        if (cpumask_weight(&wd_cpus_enabled) == 1) {
                cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
                wd_smp_last_reset_tb = get_tb();
        }
        wd_smp_unlock(&flags);

        start_watchdog_timer_on(cpu);

        return 0;
}

static int stop_wd_on_cpu(unsigned int cpu)
{
        unsigned long flags;

        if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
                return 0; /* Can happen in CPU unplug case */

        stop_watchdog_timer_on(cpu);

        wd_smp_lock(&flags);
        cpumask_clear_cpu(cpu, &wd_cpus_enabled);
        wd_smp_unlock(&flags);

        wd_smp_clear_cpu_pending(cpu, get_tb());

        return 0;
}
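
/*
 * Derive the timebase timeouts and the heartbeat timer period from the
 * common watchdog_thresh sysctl (in seconds).
 */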
static void watchdog_calc_timeouts(void)
{
        wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;

        /* Have the SMP detector trigger a bit later */
        wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;

        /* 2/5 is the factor that the perf based detector uses */
        wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
}
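
/*
 * Arch hooks invoked by the core watchdog code when the watchdog is
 * reconfigured (threshold, enabled state or cpumask changes).
 */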
void watchdog_nmi_stop(void)
{
        int cpu;

        for_each_cpu(cpu, &wd_cpus_enabled)
                stop_wd_on_cpu(cpu);
}

void watchdog_nmi_start(void)
{
        int cpu;

        watchdog_calc_timeouts();
        for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
                start_wd_on_cpu(cpu);
}

/*
 * Invoked from core watchdog init.
 */
int __init watchdog_nmi_probe(void)
{
        int err;

        err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                        "powerpc/watchdog:online",
                                        start_wd_on_cpu, stop_wd_on_cpu);
        if (err < 0) {
                pr_warn("could not be initialized\n");
                return err;
        }
        return 0;
}