mce.c 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. /*
  2. * Machine check exception handling.
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 2 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17. *
  18. * Copyright 2013 IBM Corporation
  19. * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
  20. */
  21. #undef DEBUG
  22. #define pr_fmt(fmt) "mce: " fmt
  23. #include <linux/types.h>
  24. #include <linux/ptrace.h>
  25. #include <linux/percpu.h>
  26. #include <linux/export.h>
  27. #include <linux/irq_work.h>
  28. #include <asm/mce.h>
  29. static DEFINE_PER_CPU(int, mce_nest_count);
  30. static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
  31. /* Queue for delayed MCE events. */
  32. static DEFINE_PER_CPU(int, mce_queue_count);
  33. static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
  34. static void machine_check_process_queued_event(struct irq_work *work);
  35. static struct irq_work mce_event_process_work = {
  36. .func = machine_check_process_queued_event,
  37. };
  38. static void mce_set_error_info(struct machine_check_event *mce,
  39. struct mce_error_info *mce_err)
  40. {
  41. mce->error_type = mce_err->error_type;
  42. switch (mce_err->error_type) {
  43. case MCE_ERROR_TYPE_UE:
  44. mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
  45. break;
  46. case MCE_ERROR_TYPE_SLB:
  47. mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
  48. break;
  49. case MCE_ERROR_TYPE_ERAT:
  50. mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
  51. break;
  52. case MCE_ERROR_TYPE_TLB:
  53. mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
  54. break;
  55. case MCE_ERROR_TYPE_UNKNOWN:
  56. default:
  57. break;
  58. }
  59. }
  60. /*
  61. * Decode and save high level MCE information into per cpu buffer which
  62. * is an array of machine_check_event structure.
  63. */
  64. void save_mce_event(struct pt_regs *regs, long handled,
  65. struct mce_error_info *mce_err,
  66. uint64_t nip, uint64_t addr)
  67. {
  68. int index = __this_cpu_inc_return(mce_nest_count) - 1;
  69. struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
  70. /*
  71. * Return if we don't have enough space to log mce event.
  72. * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
  73. * the check below will stop buffer overrun.
  74. */
  75. if (index >= MAX_MC_EVT)
  76. return;
  77. /* Populate generic machine check info */
  78. mce->version = MCE_V1;
  79. mce->srr0 = nip;
  80. mce->srr1 = regs->msr;
  81. mce->gpr3 = regs->gpr[3];
  82. mce->in_use = 1;
  83. mce->initiator = MCE_INITIATOR_CPU;
  84. /* Mark it recovered if we have handled it and MSR(RI=1). */
  85. if (handled && (regs->msr & MSR_RI))
  86. mce->disposition = MCE_DISPOSITION_RECOVERED;
  87. else
  88. mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
  89. mce->severity = MCE_SEV_ERROR_SYNC;
  90. /*
  91. * Populate the mce error_type and type-specific error_type.
  92. */
  93. mce_set_error_info(mce, mce_err);
  94. if (!addr)
  95. return;
  96. if (mce->error_type == MCE_ERROR_TYPE_TLB) {
  97. mce->u.tlb_error.effective_address_provided = true;
  98. mce->u.tlb_error.effective_address = addr;
  99. } else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
  100. mce->u.slb_error.effective_address_provided = true;
  101. mce->u.slb_error.effective_address = addr;
  102. } else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
  103. mce->u.erat_error.effective_address_provided = true;
  104. mce->u.erat_error.effective_address = addr;
  105. } else if (mce->error_type == MCE_ERROR_TYPE_UE) {
  106. mce->u.ue_error.effective_address_provided = true;
  107. mce->u.ue_error.effective_address = addr;
  108. }
  109. return;
  110. }
  111. /*
  112. * get_mce_event:
  113. * mce Pointer to machine_check_event structure to be filled.
  114. * release Flag to indicate whether to free the event slot or not.
  115. * 0 <= do not release the mce event. Caller will invoke
  116. * release_mce_event() once event has been consumed.
  117. * 1 <= release the slot.
  118. *
  119. * return 1 = success
  120. * 0 = failure
  121. *
  122. * get_mce_event() will be called by platform specific machine check
  123. * handle routine and in KVM.
  124. * When we call get_mce_event(), we are still in interrupt context and
  125. * preemption will not be scheduled until ret_from_expect() routine
  126. * is called.
  127. */
  128. int get_mce_event(struct machine_check_event *mce, bool release)
  129. {
  130. int index = __this_cpu_read(mce_nest_count) - 1;
  131. struct machine_check_event *mc_evt;
  132. int ret = 0;
  133. /* Sanity check */
  134. if (index < 0)
  135. return ret;
  136. /* Check if we have MCE info to process. */
  137. if (index < MAX_MC_EVT) {
  138. mc_evt = this_cpu_ptr(&mce_event[index]);
  139. /* Copy the event structure and release the original */
  140. if (mce)
  141. *mce = *mc_evt;
  142. if (release)
  143. mc_evt->in_use = 0;
  144. ret = 1;
  145. }
  146. /* Decrement the count to free the slot. */
  147. if (release)
  148. __this_cpu_dec(mce_nest_count);
  149. return ret;
  150. }
  /*
   * Release the most recently saved MCE event on this cpu without copying
   * it anywhere.  The result of get_mce_event() is deliberately ignored.
   */
  void release_mce_event(void)
  {
      (void)get_mce_event(NULL, true);
  }
  155. /*
  156. * Queue up the MCE event which then can be handled later.
  157. */
  158. void machine_check_queue_event(void)
  159. {
  160. int index;
  161. struct machine_check_event evt;
  162. if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
  163. return;
  164. index = __this_cpu_inc_return(mce_queue_count) - 1;
  165. /* If queue is full, just return for now. */
  166. if (index >= MAX_MC_EVT) {
  167. __this_cpu_dec(mce_queue_count);
  168. return;
  169. }
  170. memcpy(this_cpu_ptr(&mce_event_queue[index]), &evt, sizeof(evt));
  171. /* Queue irq work to process this event later. */
  172. irq_work_queue(&mce_event_process_work);
  173. }
  174. /*
  175. * process pending MCE event from the mce event queue. This function will be
  176. * called during syscall exit.
  177. */
  178. static void machine_check_process_queued_event(struct irq_work *work)
  179. {
  180. int index;
  181. /*
  182. * For now just print it to console.
  183. * TODO: log this error event to FSP or nvram.
  184. */
  185. while (__this_cpu_read(mce_queue_count) > 0) {
  186. index = __this_cpu_read(mce_queue_count) - 1;
  187. machine_check_print_event_info(
  188. this_cpu_ptr(&mce_event_queue[index]));
  189. __this_cpu_dec(mce_queue_count);
  190. }
  191. }
  192. void machine_check_print_event_info(struct machine_check_event *evt)
  193. {
  194. const char *level, *sevstr, *subtype;
  195. static const char *mc_ue_types[] = {
  196. "Indeterminate",
  197. "Instruction fetch",
  198. "Page table walk ifetch",
  199. "Load/Store",
  200. "Page table walk Load/Store",
  201. };
  202. static const char *mc_slb_types[] = {
  203. "Indeterminate",
  204. "Parity",
  205. "Multihit",
  206. };
  207. static const char *mc_erat_types[] = {
  208. "Indeterminate",
  209. "Parity",
  210. "Multihit",
  211. };
  212. static const char *mc_tlb_types[] = {
  213. "Indeterminate",
  214. "Parity",
  215. "Multihit",
  216. };
  217. /* Print things out */
  218. if (evt->version != MCE_V1) {
  219. pr_err("Machine Check Exception, Unknown event version %d !\n",
  220. evt->version);
  221. return;
  222. }
  223. switch (evt->severity) {
  224. case MCE_SEV_NO_ERROR:
  225. level = KERN_INFO;
  226. sevstr = "Harmless";
  227. break;
  228. case MCE_SEV_WARNING:
  229. level = KERN_WARNING;
  230. sevstr = "";
  231. break;
  232. case MCE_SEV_ERROR_SYNC:
  233. level = KERN_ERR;
  234. sevstr = "Severe";
  235. break;
  236. case MCE_SEV_FATAL:
  237. default:
  238. level = KERN_ERR;
  239. sevstr = "Fatal";
  240. break;
  241. }
  242. printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
  243. evt->disposition == MCE_DISPOSITION_RECOVERED ?
  244. "Recovered" : "[Not recovered");
  245. printk("%s Initiator: %s\n", level,
  246. evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
  247. switch (evt->error_type) {
  248. case MCE_ERROR_TYPE_UE:
  249. subtype = evt->u.ue_error.ue_error_type <
  250. ARRAY_SIZE(mc_ue_types) ?
  251. mc_ue_types[evt->u.ue_error.ue_error_type]
  252. : "Unknown";
  253. printk("%s Error type: UE [%s]\n", level, subtype);
  254. if (evt->u.ue_error.effective_address_provided)
  255. printk("%s Effective address: %016llx\n",
  256. level, evt->u.ue_error.effective_address);
  257. if (evt->u.ue_error.physical_address_provided)
  258. printk("%s Physical address: %016llx\n",
  259. level, evt->u.ue_error.physical_address);
  260. break;
  261. case MCE_ERROR_TYPE_SLB:
  262. subtype = evt->u.slb_error.slb_error_type <
  263. ARRAY_SIZE(mc_slb_types) ?
  264. mc_slb_types[evt->u.slb_error.slb_error_type]
  265. : "Unknown";
  266. printk("%s Error type: SLB [%s]\n", level, subtype);
  267. if (evt->u.slb_error.effective_address_provided)
  268. printk("%s Effective address: %016llx\n",
  269. level, evt->u.slb_error.effective_address);
  270. break;
  271. case MCE_ERROR_TYPE_ERAT:
  272. subtype = evt->u.erat_error.erat_error_type <
  273. ARRAY_SIZE(mc_erat_types) ?
  274. mc_erat_types[evt->u.erat_error.erat_error_type]
  275. : "Unknown";
  276. printk("%s Error type: ERAT [%s]\n", level, subtype);
  277. if (evt->u.erat_error.effective_address_provided)
  278. printk("%s Effective address: %016llx\n",
  279. level, evt->u.erat_error.effective_address);
  280. break;
  281. case MCE_ERROR_TYPE_TLB:
  282. subtype = evt->u.tlb_error.tlb_error_type <
  283. ARRAY_SIZE(mc_tlb_types) ?
  284. mc_tlb_types[evt->u.tlb_error.tlb_error_type]
  285. : "Unknown";
  286. printk("%s Error type: TLB [%s]\n", level, subtype);
  287. if (evt->u.tlb_error.effective_address_provided)
  288. printk("%s Effective address: %016llx\n",
  289. level, evt->u.tlb_error.effective_address);
  290. break;
  291. default:
  292. case MCE_ERROR_TYPE_UNKNOWN:
  293. printk("%s Error type: Unknown\n", level);
  294. break;
  295. }
  296. }
  297. uint64_t get_mce_fault_addr(struct machine_check_event *evt)
  298. {
  299. switch (evt->error_type) {
  300. case MCE_ERROR_TYPE_UE:
  301. if (evt->u.ue_error.effective_address_provided)
  302. return evt->u.ue_error.effective_address;
  303. break;
  304. case MCE_ERROR_TYPE_SLB:
  305. if (evt->u.slb_error.effective_address_provided)
  306. return evt->u.slb_error.effective_address;
  307. break;
  308. case MCE_ERROR_TYPE_ERAT:
  309. if (evt->u.erat_error.effective_address_provided)
  310. return evt->u.erat_error.effective_address;
  311. break;
  312. case MCE_ERROR_TYPE_TLB:
  313. if (evt->u.tlb_error.effective_address_provided)
  314. return evt->u.tlb_error.effective_address;
  315. break;
  316. default:
  317. case MCE_ERROR_TYPE_UNKNOWN:
  318. break;
  319. }
  320. return 0;
  321. }
  322. EXPORT_SYMBOL(get_mce_fault_addr);