/* nmi.c */
  1. /*
  2. * Copyright (C) 1991, 1992 Linus Torvalds
  3. * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
  4. * Copyright (C) 2011 Don Zickus Red Hat, Inc.
  5. *
  6. * Pentium III FXSR, SSE support
  7. * Gareth Hughes <gareth@valinux.com>, May 2000
  8. */
  9. /*
  10. * Handle hardware traps and faults.
  11. */
  12. #include <linux/spinlock.h>
  13. #include <linux/kprobes.h>
  14. #include <linux/kdebug.h>
  15. #include <linux/nmi.h>
  16. #include <linux/debugfs.h>
  17. #include <linux/delay.h>
  18. #include <linux/hardirq.h>
  19. #include <linux/slab.h>
  20. #include <linux/export.h>
  21. #if defined(CONFIG_EDAC)
  22. #include <linux/edac.h>
  23. #endif
  24. #include <linux/atomic.h>
  25. #include <asm/traps.h>
  26. #include <asm/mach_traps.h>
  27. #include <asm/nmi.h>
  28. #include <asm/x86_init.h>
  29. #define CREATE_TRACE_POINTS
  30. #include <trace/events/nmi.h>
  31. struct nmi_desc {
  32. spinlock_t lock;
  33. struct list_head head;
  34. };
  35. static struct nmi_desc nmi_desc[NMI_MAX] =
  36. {
  37. {
  38. .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
  39. .head = LIST_HEAD_INIT(nmi_desc[0].head),
  40. },
  41. {
  42. .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
  43. .head = LIST_HEAD_INIT(nmi_desc[1].head),
  44. },
  45. {
  46. .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
  47. .head = LIST_HEAD_INIT(nmi_desc[2].head),
  48. },
  49. {
  50. .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
  51. .head = LIST_HEAD_INIT(nmi_desc[3].head),
  52. },
  53. };
  54. struct nmi_stats {
  55. unsigned int normal;
  56. unsigned int unknown;
  57. unsigned int external;
  58. unsigned int swallow;
  59. };
  60. static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
  61. static int ignore_nmis;
  62. int unknown_nmi_panic;
  63. /*
  64. * Prevent NMI reason port (0x61) being accessed simultaneously, can
  65. * only be used in NMI handler.
  66. */
  67. static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
  68. static int __init setup_unknown_nmi_panic(char *str)
  69. {
  70. unknown_nmi_panic = 1;
  71. return 1;
  72. }
  73. __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
  74. #define nmi_to_desc(type) (&nmi_desc[type])
  75. static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
  76. static int __init nmi_warning_debugfs(void)
  77. {
  78. debugfs_create_u64("nmi_longest_ns", 0644,
  79. arch_debugfs_dir, &nmi_longest_ns);
  80. return 0;
  81. }
  82. fs_initcall(nmi_warning_debugfs);
  83. static void nmi_max_handler(struct irq_work *w)
  84. {
  85. struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
  86. int remainder_ns, decimal_msecs;
  87. u64 whole_msecs = ACCESS_ONCE(a->max_duration);
  88. remainder_ns = do_div(whole_msecs, (1000 * 1000));
  89. decimal_msecs = remainder_ns / 1000;
  90. printk_ratelimited(KERN_INFO
  91. "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
  92. a->handler, whole_msecs, decimal_msecs);
  93. }
  94. static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
  95. {
  96. struct nmi_desc *desc = nmi_to_desc(type);
  97. struct nmiaction *a;
  98. int handled=0;
  99. rcu_read_lock();
  100. /*
  101. * NMIs are edge-triggered, which means if you have enough
  102. * of them concurrently, you can lose some because only one
  103. * can be latched at any given time. Walk the whole list
  104. * to handle those situations.
  105. */
  106. list_for_each_entry_rcu(a, &desc->head, list) {
  107. int thishandled;
  108. u64 delta;
  109. delta = sched_clock();
  110. thishandled = a->handler(type, regs);
  111. handled += thishandled;
  112. delta = sched_clock() - delta;
  113. trace_nmi_handler(a->handler, (int)delta, thishandled);
  114. if (delta < nmi_longest_ns || delta < a->max_duration)
  115. continue;
  116. a->max_duration = delta;
  117. irq_work_queue(&a->irq_work);
  118. }
  119. rcu_read_unlock();
  120. /* return total number of NMI events handled */
  121. return handled;
  122. }
  123. int __register_nmi_handler(unsigned int type, struct nmiaction *action)
  124. {
  125. struct nmi_desc *desc = nmi_to_desc(type);
  126. unsigned long flags;
  127. if (!action->handler)
  128. return -EINVAL;
  129. init_irq_work(&action->irq_work, nmi_max_handler);
  130. spin_lock_irqsave(&desc->lock, flags);
  131. /*
  132. * most handlers of type NMI_UNKNOWN never return because
  133. * they just assume the NMI is theirs. Just a sanity check
  134. * to manage expectations
  135. */
  136. WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
  137. WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
  138. WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
  139. /*
  140. * some handlers need to be executed first otherwise a fake
  141. * event confuses some handlers (kdump uses this flag)
  142. */
  143. if (action->flags & NMI_FLAG_FIRST)
  144. list_add_rcu(&action->list, &desc->head);
  145. else
  146. list_add_tail_rcu(&action->list, &desc->head);
  147. spin_unlock_irqrestore(&desc->lock, flags);
  148. return 0;
  149. }
  150. EXPORT_SYMBOL(__register_nmi_handler);
  151. void unregister_nmi_handler(unsigned int type, const char *name)
  152. {
  153. struct nmi_desc *desc = nmi_to_desc(type);
  154. struct nmiaction *n;
  155. unsigned long flags;
  156. spin_lock_irqsave(&desc->lock, flags);
  157. list_for_each_entry_rcu(n, &desc->head, list) {
  158. /*
  159. * the name passed in to describe the nmi handler
  160. * is used as the lookup key
  161. */
  162. if (!strcmp(n->name, name)) {
  163. WARN(in_nmi(),
  164. "Trying to free NMI (%s) from NMI context!\n", n->name);
  165. list_del_rcu(&n->list);
  166. break;
  167. }
  168. }
  169. spin_unlock_irqrestore(&desc->lock, flags);
  170. synchronize_rcu();
  171. }
  172. EXPORT_SYMBOL_GPL(unregister_nmi_handler);
  173. static __kprobes void
  174. pci_serr_error(unsigned char reason, struct pt_regs *regs)
  175. {
  176. /* check to see if anyone registered against these types of errors */
  177. if (nmi_handle(NMI_SERR, regs, false))
  178. return;
  179. pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
  180. reason, smp_processor_id());
  181. /*
  182. * On some machines, PCI SERR line is used to report memory
  183. * errors. EDAC makes use of it.
  184. */
  185. #if defined(CONFIG_EDAC)
  186. if (edac_handler_set()) {
  187. edac_atomic_assert_error();
  188. return;
  189. }
  190. #endif
  191. if (panic_on_unrecovered_nmi)
  192. panic("NMI: Not continuing");
  193. pr_emerg("Dazed and confused, but trying to continue\n");
  194. /* Clear and disable the PCI SERR error line. */
  195. reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
  196. outb(reason, NMI_REASON_PORT);
  197. }
  198. static __kprobes void
  199. io_check_error(unsigned char reason, struct pt_regs *regs)
  200. {
  201. unsigned long i;
  202. /* check to see if anyone registered against these types of errors */
  203. if (nmi_handle(NMI_IO_CHECK, regs, false))
  204. return;
  205. pr_emerg(
  206. "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
  207. reason, smp_processor_id());
  208. show_regs(regs);
  209. if (panic_on_io_nmi)
  210. panic("NMI IOCK error: Not continuing");
  211. /* Re-enable the IOCK line, wait for a few seconds */
  212. reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
  213. outb(reason, NMI_REASON_PORT);
  214. i = 20000;
  215. while (--i) {
  216. touch_nmi_watchdog();
  217. udelay(100);
  218. }
  219. reason &= ~NMI_REASON_CLEAR_IOCHK;
  220. outb(reason, NMI_REASON_PORT);
  221. }
  222. static __kprobes void
  223. unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
  224. {
  225. int handled;
  226. /*
  227. * Use 'false' as back-to-back NMIs are dealt with one level up.
  228. * Of course this makes having multiple 'unknown' handlers useless
  229. * as only the first one is ever run (unless it can actually determine
  230. * if it caused the NMI)
  231. */
  232. handled = nmi_handle(NMI_UNKNOWN, regs, false);
  233. if (handled) {
  234. __this_cpu_add(nmi_stats.unknown, handled);
  235. return;
  236. }
  237. __this_cpu_add(nmi_stats.unknown, 1);
  238. pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
  239. reason, smp_processor_id());
  240. pr_emerg("Do you have a strange power saving mode enabled?\n");
  241. if (unknown_nmi_panic || panic_on_unrecovered_nmi)
  242. panic("NMI: Not continuing");
  243. pr_emerg("Dazed and confused, but trying to continue\n");
  244. }
  245. static DEFINE_PER_CPU(bool, swallow_nmi);
  246. static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
  247. static __kprobes void default_do_nmi(struct pt_regs *regs)
  248. {
  249. unsigned char reason = 0;
  250. int handled;
  251. bool b2b = false;
  252. /*
  253. * CPU-specific NMI must be processed before non-CPU-specific
  254. * NMI, otherwise we may lose it, because the CPU-specific
  255. * NMI can not be detected/processed on other CPUs.
  256. */
  257. /*
  258. * Back-to-back NMIs are interesting because they can either
  259. * be two NMI or more than two NMIs (any thing over two is dropped
  260. * due to NMI being edge-triggered). If this is the second half
  261. * of the back-to-back NMI, assume we dropped things and process
  262. * more handlers. Otherwise reset the 'swallow' NMI behaviour
  263. */
  264. if (regs->ip == __this_cpu_read(last_nmi_rip))
  265. b2b = true;
  266. else
  267. __this_cpu_write(swallow_nmi, false);
  268. __this_cpu_write(last_nmi_rip, regs->ip);
  269. handled = nmi_handle(NMI_LOCAL, regs, b2b);
  270. __this_cpu_add(nmi_stats.normal, handled);
  271. if (handled) {
  272. /*
  273. * There are cases when a NMI handler handles multiple
  274. * events in the current NMI. One of these events may
  275. * be queued for in the next NMI. Because the event is
  276. * already handled, the next NMI will result in an unknown
  277. * NMI. Instead lets flag this for a potential NMI to
  278. * swallow.
  279. */
  280. if (handled > 1)
  281. __this_cpu_write(swallow_nmi, true);
  282. return;
  283. }
  284. /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
  285. raw_spin_lock(&nmi_reason_lock);
  286. reason = x86_platform.get_nmi_reason();
  287. if (reason & NMI_REASON_MASK) {
  288. if (reason & NMI_REASON_SERR)
  289. pci_serr_error(reason, regs);
  290. else if (reason & NMI_REASON_IOCHK)
  291. io_check_error(reason, regs);
  292. #ifdef CONFIG_X86_32
  293. /*
  294. * Reassert NMI in case it became active
  295. * meanwhile as it's edge-triggered:
  296. */
  297. reassert_nmi();
  298. #endif
  299. __this_cpu_add(nmi_stats.external, 1);
  300. raw_spin_unlock(&nmi_reason_lock);
  301. return;
  302. }
  303. raw_spin_unlock(&nmi_reason_lock);
  304. /*
  305. * Only one NMI can be latched at a time. To handle
  306. * this we may process multiple nmi handlers at once to
  307. * cover the case where an NMI is dropped. The downside
  308. * to this approach is we may process an NMI prematurely,
  309. * while its real NMI is sitting latched. This will cause
  310. * an unknown NMI on the next run of the NMI processing.
  311. *
  312. * We tried to flag that condition above, by setting the
  313. * swallow_nmi flag when we process more than one event.
  314. * This condition is also only present on the second half
  315. * of a back-to-back NMI, so we flag that condition too.
  316. *
  317. * If both are true, we assume we already processed this
  318. * NMI previously and we swallow it. Otherwise we reset
  319. * the logic.
  320. *
  321. * There are scenarios where we may accidentally swallow
  322. * a 'real' unknown NMI. For example, while processing
  323. * a perf NMI another perf NMI comes in along with a
  324. * 'real' unknown NMI. These two NMIs get combined into
  325. * one (as descibed above). When the next NMI gets
  326. * processed, it will be flagged by perf as handled, but
  327. * noone will know that there was a 'real' unknown NMI sent
  328. * also. As a result it gets swallowed. Or if the first
  329. * perf NMI returns two events handled then the second
  330. * NMI will get eaten by the logic below, again losing a
  331. * 'real' unknown NMI. But this is the best we can do
  332. * for now.
  333. */
  334. if (b2b && __this_cpu_read(swallow_nmi))
  335. __this_cpu_add(nmi_stats.swallow, 1);
  336. else
  337. unknown_nmi_error(reason, regs);
  338. }
  339. /*
  340. * NMIs can hit breakpoints which will cause it to lose its
  341. * NMI context with the CPU when the breakpoint does an iret.
  342. */
  343. #ifdef CONFIG_X86_32
  344. /*
  345. * For i386, NMIs use the same stack as the kernel, and we can
  346. * add a workaround to the iret problem in C (preventing nested
  347. * NMIs if an NMI takes a trap). Simply have 3 states the NMI
  348. * can be in:
  349. *
  350. * 1) not running
  351. * 2) executing
  352. * 3) latched
  353. *
  354. * When no NMI is in progress, it is in the "not running" state.
  355. * When an NMI comes in, it goes into the "executing" state.
  356. * Normally, if another NMI is triggered, it does not interrupt
  357. * the running NMI and the HW will simply latch it so that when
  358. * the first NMI finishes, it will restart the second NMI.
  359. * (Note, the latch is binary, thus multiple NMIs triggering,
  360. * when one is running, are ignored. Only one NMI is restarted.)
  361. *
  362. * If an NMI hits a breakpoint that executes an iret, another
  363. * NMI can preempt it. We do not want to allow this new NMI
  364. * to run, but we want to execute it when the first one finishes.
  365. * We set the state to "latched", and the exit of the first NMI will
  366. * perform a dec_return, if the result is zero (NOT_RUNNING), then
  367. * it will simply exit the NMI handler. If not, the dec_return
  368. * would have set the state to NMI_EXECUTING (what we want it to
  369. * be when we are running). In this case, we simply jump back
  370. * to rerun the NMI handler again, and restart the 'latched' NMI.
  371. *
  372. * No trap (breakpoint or page fault) should be hit before nmi_restart,
  373. * thus there is no race between the first check of state for NOT_RUNNING
  374. * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
  375. * at this point.
  376. *
  377. * In case the NMI takes a page fault, we need to save off the CR2
  378. * because the NMI could have preempted another page fault and corrupt
  379. * the CR2 that is about to be read. As nested NMIs must be restarted
  380. * and they can not take breakpoints or page faults, the update of the
  381. * CR2 must be done before converting the nmi state back to NOT_RUNNING.
  382. * Otherwise, there would be a race of another nested NMI coming in
  383. * after setting state to NOT_RUNNING but before updating the nmi_cr2.
  384. */
  385. enum nmi_states {
  386. NMI_NOT_RUNNING = 0,
  387. NMI_EXECUTING,
  388. NMI_LATCHED,
  389. };
  390. static DEFINE_PER_CPU(enum nmi_states, nmi_state);
  391. static DEFINE_PER_CPU(unsigned long, nmi_cr2);
  392. #define nmi_nesting_preprocess(regs) \
  393. do { \
  394. if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
  395. this_cpu_write(nmi_state, NMI_LATCHED); \
  396. return; \
  397. } \
  398. this_cpu_write(nmi_state, NMI_EXECUTING); \
  399. this_cpu_write(nmi_cr2, read_cr2()); \
  400. } while (0); \
  401. nmi_restart:
  402. #define nmi_nesting_postprocess() \
  403. do { \
  404. if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
  405. write_cr2(this_cpu_read(nmi_cr2)); \
  406. if (this_cpu_dec_return(nmi_state)) \
  407. goto nmi_restart; \
  408. } while (0)
  409. #else /* x86_64 */
  410. /*
  411. * In x86_64 things are a bit more difficult. This has the same problem
  412. * where an NMI hitting a breakpoint that calls iret will remove the
  413. * NMI context, allowing a nested NMI to enter. What makes this more
  414. * difficult is that both NMIs and breakpoints have their own stack.
  415. * When a new NMI or breakpoint is executed, the stack is set to a fixed
  416. * point. If an NMI is nested, it will have its stack set at that same
  417. * fixed address that the first NMI had, and will start corrupting the
  418. * stack. This is handled in entry_64.S, but the same problem exists with
  419. * the breakpoint stack.
  420. *
  421. * If a breakpoint is being processed, and the debug stack is being used,
  422. * if an NMI comes in and also hits a breakpoint, the stack pointer
  423. * will be set to the same fixed address as the breakpoint that was
  424. * interrupted, causing that stack to be corrupted. To handle this case,
  425. * check if the stack that was interrupted is the debug stack, and if
  426. * so, change the IDT so that new breakpoints will use the current stack
  427. * and not switch to the fixed address. On return of the NMI, switch back
  428. * to the original IDT.
  429. */
  430. static DEFINE_PER_CPU(int, update_debug_stack);
  431. static inline void nmi_nesting_preprocess(struct pt_regs *regs)
  432. {
  433. /*
  434. * If we interrupted a breakpoint, it is possible that
  435. * the nmi handler will have breakpoints too. We need to
  436. * change the IDT such that breakpoints that happen here
  437. * continue to use the NMI stack.
  438. */
  439. if (unlikely(is_debug_stack(regs->sp))) {
  440. debug_stack_set_zero();
  441. this_cpu_write(update_debug_stack, 1);
  442. }
  443. }
  444. static inline void nmi_nesting_postprocess(void)
  445. {
  446. if (unlikely(this_cpu_read(update_debug_stack))) {
  447. debug_stack_reset();
  448. this_cpu_write(update_debug_stack, 0);
  449. }
  450. }
  451. #endif
  452. dotraplinkage notrace __kprobes void
  453. do_nmi(struct pt_regs *regs, long error_code)
  454. {
  455. nmi_nesting_preprocess(regs);
  456. nmi_enter();
  457. inc_irq_stat(__nmi_count);
  458. if (!ignore_nmis)
  459. default_do_nmi(regs);
  460. nmi_exit();
  461. /* On i386, may loop back to preprocess */
  462. nmi_nesting_postprocess();
  463. }
  464. void stop_nmi(void)
  465. {
  466. ignore_nmis++;
  467. }
  468. void restart_nmi(void)
  469. {
  470. ignore_nmis--;
  471. }
  472. /* reset the back-to-back NMI logic */
  473. void local_touch_nmi(void)
  474. {
  475. __this_cpu_write(last_nmi_rip, 0);
  476. }
  477. EXPORT_SYMBOL_GPL(local_touch_nmi);