mce.c 55 KB

  1. /*
  2. * Machine check handler.
  3. *
  4. * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  5. * Rest from unknown author(s).
  6. * 2004 Andi Kleen. Rewrote most of it.
  7. * Copyright 2008 Intel Corporation
  8. * Author: Andi Kleen
  9. */
  10. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  11. #include <linux/thread_info.h>
  12. #include <linux/capability.h>
  13. #include <linux/miscdevice.h>
  14. #include <linux/ratelimit.h>
  15. #include <linux/kallsyms.h>
  16. #include <linux/rcupdate.h>
  17. #include <linux/kobject.h>
  18. #include <linux/uaccess.h>
  19. #include <linux/kdebug.h>
  20. #include <linux/kernel.h>
  21. #include <linux/percpu.h>
  22. #include <linux/string.h>
  23. #include <linux/device.h>
  24. #include <linux/syscore_ops.h>
  25. #include <linux/delay.h>
  26. #include <linux/ctype.h>
  27. #include <linux/sched.h>
  28. #include <linux/sysfs.h>
  29. #include <linux/types.h>
  30. #include <linux/slab.h>
  31. #include <linux/init.h>
  32. #include <linux/kmod.h>
  33. #include <linux/poll.h>
  34. #include <linux/nmi.h>
  35. #include <linux/cpu.h>
  36. #include <linux/ras.h>
  37. #include <linux/smp.h>
  38. #include <linux/fs.h>
  39. #include <linux/mm.h>
  40. #include <linux/debugfs.h>
  41. #include <linux/irq_work.h>
  42. #include <linux/export.h>
  43. #include <linux/jump_label.h>
  44. #include <asm/intel-family.h>
  45. #include <asm/processor.h>
  46. #include <asm/traps.h>
  47. #include <asm/tlbflush.h>
  48. #include <asm/mce.h>
  49. #include <asm/msr.h>
  50. #include <asm/reboot.h>
  51. #include "mce-internal.h"
  52. static DEFINE_MUTEX(mce_log_mutex);
  53. #define CREATE_TRACE_POINTS
  54. #include <trace/events/mce.h>
  55. #define SPINUNIT 100 /* 100ns */
  56. DEFINE_PER_CPU(unsigned, mce_exception_count);
  57. struct mce_bank *mce_banks __read_mostly;
  58. struct mce_vendor_flags mce_flags __read_mostly;
  59. struct mca_config mca_cfg __read_mostly = {
  60. .bootlog = -1,
  61. /*
  62. * Tolerant levels:
  63. * 0: always panic on uncorrected errors, log corrected errors
  64. * 1: panic or SIGBUS on uncorrected errors, log corrected errors
  65. * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
  66. * 3: never panic or SIGBUS, log all errors (for testing only)
  67. */
  68. .tolerant = 1,
  69. .monarch_timeout = -1
  70. };
  71. static DEFINE_PER_CPU(struct mce, mces_seen);
  72. static unsigned long mce_need_notify;
  73. static int cpu_missing;
  74. /*
75. * MCA banks polled by the periodic polling timer for corrected events.
  76. * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
  77. */
  78. DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
  79. [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
  80. };
  81. /*
  82. * MCA banks controlled through firmware first for corrected errors.
  83. * This is a global list of banks for which we won't enable CMCI and we
  84. * won't poll. Firmware controls these banks and is responsible for
  85. * reporting corrected errors through GHES. Uncorrected/recoverable
  86. * errors are still notified through a machine check.
  87. */
  88. mce_banks_t mce_banks_ce_disabled;
  89. static struct work_struct mce_work;
  90. static struct irq_work mce_irq_work;
  91. static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
  92. /*
  93. * CPU/chipset specific EDAC code can register a notifier call here to print
  94. * MCE errors in a human-readable form.
  95. */
  96. BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
  97. /* Do initial initialization of a struct mce */
  98. void mce_setup(struct mce *m)
  99. {
  100. memset(m, 0, sizeof(struct mce));
  101. m->cpu = m->extcpu = smp_processor_id();
  102. /* We hope get_seconds stays lockless */
  103. m->time = get_seconds();
  104. m->cpuvendor = boot_cpu_data.x86_vendor;
  105. m->cpuid = cpuid_eax(1);
  106. m->socketid = cpu_data(m->extcpu).phys_proc_id;
  107. m->apicid = cpu_data(m->extcpu).initial_apicid;
  108. rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
  109. if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
  110. rdmsrl(MSR_PPIN, m->ppin);
  111. }
  112. DEFINE_PER_CPU(struct mce, injectm);
  113. EXPORT_PER_CPU_SYMBOL_GPL(injectm);
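/*
 * Log an MCE record: add it to the lockless event pool and, on success,
 * kick irq_work so the record is processed outside #MC/NMI context.
 */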
  114. void mce_log(struct mce *m)
  115. {
  116. if (!mce_gen_pool_add(m))
  117. irq_work_queue(&mce_irq_work);
  118. }
  119. void mce_inject_log(struct mce *m)
  120. {
  121. mutex_lock(&mce_log_mutex);
  122. mce_log(m);
  123. mutex_unlock(&mce_log_mutex);
  124. }
  125. EXPORT_SYMBOL_GPL(mce_inject_log);
  126. static struct notifier_block mce_srao_nb;
  127. /*
  128. * We run the default notifier if we have only the SRAO, the first and the
129. * default notifier registered, i.e. only the mandatory NUM_DEFAULT_NOTIFIERS
130. * notifiers are registered on the chain.
  131. */
  132. #define NUM_DEFAULT_NOTIFIERS 3
  133. static atomic_t num_notifiers;
  134. void mce_register_decode_chain(struct notifier_block *nb)
  135. {
  136. if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
  137. return;
  138. atomic_inc(&num_notifiers);
  139. blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
  140. }
  141. EXPORT_SYMBOL_GPL(mce_register_decode_chain);
  142. void mce_unregister_decode_chain(struct notifier_block *nb)
  143. {
  144. atomic_dec(&num_notifiers);
  145. blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
  146. }
  147. EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
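/* Per-bank MSR address helpers for the legacy MCA register layout. */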
  148. static inline u32 ctl_reg(int bank)
  149. {
  150. return MSR_IA32_MCx_CTL(bank);
  151. }
  152. static inline u32 status_reg(int bank)
  153. {
  154. return MSR_IA32_MCx_STATUS(bank);
  155. }
  156. static inline u32 addr_reg(int bank)
  157. {
  158. return MSR_IA32_MCx_ADDR(bank);
  159. }
  160. static inline u32 misc_reg(int bank)
  161. {
  162. return MSR_IA32_MCx_MISC(bank);
  163. }
  164. static inline u32 smca_ctl_reg(int bank)
  165. {
  166. return MSR_AMD64_SMCA_MCx_CTL(bank);
  167. }
  168. static inline u32 smca_status_reg(int bank)
  169. {
  170. return MSR_AMD64_SMCA_MCx_STATUS(bank);
  171. }
  172. static inline u32 smca_addr_reg(int bank)
  173. {
  174. return MSR_AMD64_SMCA_MCx_ADDR(bank);
  175. }
  176. static inline u32 smca_misc_reg(int bank)
  177. {
  178. return MSR_AMD64_SMCA_MCx_MISC(bank);
  179. }
  180. struct mca_msr_regs msr_ops = {
  181. .ctl = ctl_reg,
  182. .status = status_reg,
  183. .addr = addr_reg,
  184. .misc = misc_reg
  185. };
  186. static void __print_mce(struct mce *m)
  187. {
  188. pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
  189. m->extcpu,
  190. (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
  191. m->mcgstatus, m->bank, m->status);
  192. if (m->ip) {
  193. pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
  194. !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
  195. m->cs, m->ip);
  196. if (m->cs == __KERNEL_CS)
  197. print_symbol("{%s}", m->ip);
  198. pr_cont("\n");
  199. }
  200. pr_emerg(HW_ERR "TSC %llx ", m->tsc);
  201. if (m->addr)
  202. pr_cont("ADDR %llx ", m->addr);
  203. if (m->misc)
  204. pr_cont("MISC %llx ", m->misc);
  205. if (mce_flags.smca) {
  206. if (m->synd)
  207. pr_cont("SYND %llx ", m->synd);
  208. if (m->ipid)
  209. pr_cont("IPID %llx ", m->ipid);
  210. }
  211. pr_cont("\n");
  212. /*
  213. * Note this output is parsed by external tools and old fields
  214. * should not be changed.
  215. */
  216. pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
  217. m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
  218. cpu_data(m->extcpu).microcode);
  219. }
  220. static void print_mce(struct mce *m)
  221. {
  222. __print_mce(m);
  223. pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
  224. }
  225. #define PANIC_TIMEOUT 5 /* 5 seconds */
  226. static atomic_t mce_panicked;
  227. static int fake_panic;
  228. static atomic_t mce_fake_panicked;
  229. /* Panic in progress. Enable interrupts and wait for final IPI */
  230. static void wait_for_panic(void)
  231. {
  232. long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
  233. preempt_disable();
  234. local_irq_enable();
  235. while (timeout-- > 0)
  236. udelay(1);
  237. if (panic_timeout == 0)
  238. panic_timeout = mca_cfg.panic_timeout;
239. panic("Panicking: machine check CPU died");
  240. }
  241. static void mce_panic(const char *msg, struct mce *final, char *exp)
  242. {
  243. int apei_err = 0;
  244. struct llist_node *pending;
  245. struct mce_evt_llist *l;
  246. if (!fake_panic) {
  247. /*
  248. * Make sure only one CPU runs in machine check panic
  249. */
  250. if (atomic_inc_return(&mce_panicked) > 1)
  251. wait_for_panic();
  252. barrier();
  253. bust_spinlocks(1);
  254. console_verbose();
  255. } else {
  256. /* Don't log too much for fake panic */
  257. if (atomic_inc_return(&mce_fake_panicked) > 1)
  258. return;
  259. }
  260. pending = mce_gen_pool_prepare_records();
  261. /* First print corrected ones that are still unlogged */
  262. llist_for_each_entry(l, pending, llnode) {
  263. struct mce *m = &l->mce;
  264. if (!(m->status & MCI_STATUS_UC)) {
  265. print_mce(m);
  266. if (!apei_err)
  267. apei_err = apei_write_mce(m);
  268. }
  269. }
  270. /* Now print uncorrected but with the final one last */
  271. llist_for_each_entry(l, pending, llnode) {
  272. struct mce *m = &l->mce;
  273. if (!(m->status & MCI_STATUS_UC))
  274. continue;
  275. if (!final || mce_cmp(m, final)) {
  276. print_mce(m);
  277. if (!apei_err)
  278. apei_err = apei_write_mce(m);
  279. }
  280. }
  281. if (final) {
  282. print_mce(final);
  283. if (!apei_err)
  284. apei_err = apei_write_mce(final);
  285. }
  286. if (cpu_missing)
  287. pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
  288. if (exp)
  289. pr_emerg(HW_ERR "Machine check: %s\n", exp);
  290. if (!fake_panic) {
  291. if (panic_timeout == 0)
  292. panic_timeout = mca_cfg.panic_timeout;
  293. panic(msg);
  294. } else
  295. pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
  296. }
  297. /* Support code for software error injection */
  298. static int msr_to_offset(u32 msr)
  299. {
  300. unsigned bank = __this_cpu_read(injectm.bank);
  301. if (msr == mca_cfg.rip_msr)
  302. return offsetof(struct mce, ip);
  303. if (msr == msr_ops.status(bank))
  304. return offsetof(struct mce, status);
  305. if (msr == msr_ops.addr(bank))
  306. return offsetof(struct mce, addr);
  307. if (msr == msr_ops.misc(bank))
  308. return offsetof(struct mce, misc);
  309. if (msr == MSR_IA32_MCG_STATUS)
  310. return offsetof(struct mce, mcgstatus);
  311. return -1;
  312. }
  313. /* MSR access wrappers used for error injection */
  314. static u64 mce_rdmsrl(u32 msr)
  315. {
  316. u64 v;
  317. if (__this_cpu_read(injectm.finished)) {
  318. int offset = msr_to_offset(msr);
  319. if (offset < 0)
  320. return 0;
  321. return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
  322. }
  323. if (rdmsrl_safe(msr, &v)) {
  324. WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
  325. /*
  326. * Return zero in case the access faulted. This should
  327. * not happen normally but can happen if the CPU does
  328. * something weird, or if the code is buggy.
  329. */
  330. v = 0;
  331. }
  332. return v;
  333. }
  334. static void mce_wrmsrl(u32 msr, u64 v)
  335. {
  336. if (__this_cpu_read(injectm.finished)) {
  337. int offset = msr_to_offset(msr);
  338. if (offset >= 0)
  339. *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
  340. return;
  341. }
  342. wrmsrl(msr, v);
  343. }
  344. /*
  345. * Collect all global (w.r.t. this processor) status about this machine
  346. * check into our "mce" struct so that we can use it later to assess
  347. * the severity of the problem as we read per-bank specific details.
  348. */
  349. static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
  350. {
  351. mce_setup(m);
  352. m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
  353. if (regs) {
  354. /*
  355. * Get the address of the instruction at the time of
  356. * the machine check error.
  357. */
  358. if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
  359. m->ip = regs->ip;
  360. m->cs = regs->cs;
  361. /*
  362. * When in VM86 mode make the cs look like ring 3
  363. * always. This is a lie, but it's better than passing
  364. * the additional vm86 bit around everywhere.
  365. */
  366. if (v8086_mode(regs))
  367. m->cs |= 3;
  368. }
  369. /* Use accurate RIP reporting if available. */
  370. if (mca_cfg.rip_msr)
  371. m->ip = mce_rdmsrl(mca_cfg.rip_msr);
  372. }
  373. }
  374. int mce_available(struct cpuinfo_x86 *c)
  375. {
  376. if (mca_cfg.disabled)
  377. return 0;
  378. return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
  379. }
  380. static void mce_schedule_work(void)
  381. {
  382. if (!mce_gen_pool_empty())
  383. schedule_work(&mce_work);
  384. }
  385. static void mce_irq_work_cb(struct irq_work *entry)
  386. {
  387. mce_schedule_work();
  388. }
  389. static void mce_report_event(struct pt_regs *regs)
  390. {
  391. if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
  392. mce_notify_irq();
  393. /*
  394. * Triggering the work queue here is just an insurance
  395. * policy in case the syscall exit notify handler
  396. * doesn't run soon enough or ends up running on the
  397. * wrong CPU (can happen when audit sleeps)
  398. */
  399. mce_schedule_work();
  400. return;
  401. }
  402. irq_work_queue(&mce_irq_work);
  403. }
  404. /*
  405. * Check if the address reported by the CPU is in a format we can parse.
  406. * It would be possible to add code for most other cases, but all would
  407. * be somewhat complicated (e.g. segment offset would require an instruction
408. * parser). So only support physical addresses up to page granularity for now.
  409. */
  410. static int mce_usable_address(struct mce *m)
  411. {
  412. if (!(m->status & MCI_STATUS_ADDRV))
  413. return 0;
  414. /* Checks after this one are Intel-specific: */
  415. if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
  416. return 1;
  417. if (!(m->status & MCI_STATUS_MISCV))
  418. return 0;
  419. if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
  420. return 0;
  421. if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
  422. return 0;
  423. return 1;
  424. }
  425. bool mce_is_memory_error(struct mce *m)
  426. {
  427. if (m->cpuvendor == X86_VENDOR_AMD) {
  428. /* ErrCodeExt[20:16] */
  429. u8 xec = (m->status >> 16) & 0x1f;
  430. return (xec == 0x0 || xec == 0x8);
  431. } else if (m->cpuvendor == X86_VENDOR_INTEL) {
  432. /*
  433. * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
  434. *
  435. * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
  436. * indicating a memory error. Bit 8 is used for indicating a
  437. * cache hierarchy error. The combination of bit 2 and bit 3
438. * is used for indicating a `generic' cache hierarchy error.
439. * But we can't just blindly check the above bits, because if
440. * bit 11 is set, then it is a bus/interconnect error - and
441. * either way the above bits just give more detail on what
  442. * bus/interconnect error happened. Note that bit 12 can be
  443. * ignored, as it's the "filter" bit.
  444. */
  445. return (m->status & 0xef80) == BIT(7) ||
  446. (m->status & 0xef00) == BIT(8) ||
  447. (m->status & 0xeffc) == 0xc;
  448. }
  449. return false;
  450. }
  451. EXPORT_SYMBOL_GPL(mce_is_memory_error);
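/*
 * Hand correctable DRAM errors with a usable address to the Corrected
 * Errors Collector; returns true if the event was consumed there.
 */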
  452. static bool cec_add_mce(struct mce *m)
  453. {
  454. if (!m)
  455. return false;
  456. /* We eat only correctable DRAM errors with usable addresses. */
  457. if (mce_is_memory_error(m) &&
  458. !(m->status & MCI_STATUS_UC) &&
  459. mce_usable_address(m))
  460. if (!cec_add_elem(m->addr >> PAGE_SHIFT))
  461. return true;
  462. return false;
  463. }
  464. static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
  465. void *data)
  466. {
  467. struct mce *m = (struct mce *)data;
  468. if (!m)
  469. return NOTIFY_DONE;
  470. if (cec_add_mce(m))
  471. return NOTIFY_STOP;
  472. /* Emit the trace record: */
  473. trace_mce_record(m);
  474. set_bit(0, &mce_need_notify);
  475. mce_notify_irq();
  476. return NOTIFY_DONE;
  477. }
  478. static struct notifier_block first_nb = {
  479. .notifier_call = mce_first_notifier,
  480. .priority = MCE_PRIO_FIRST,
  481. };
  482. static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
  483. void *data)
  484. {
  485. struct mce *mce = (struct mce *)data;
  486. unsigned long pfn;
  487. if (!mce)
  488. return NOTIFY_DONE;
  489. if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
  490. pfn = mce->addr >> PAGE_SHIFT;
  491. memory_failure(pfn, MCE_VECTOR, 0);
  492. }
  493. return NOTIFY_OK;
  494. }
  495. static struct notifier_block mce_srao_nb = {
  496. .notifier_call = srao_decode_notifier,
  497. .priority = MCE_PRIO_SRAO,
  498. };
  499. static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
  500. void *data)
  501. {
  502. struct mce *m = (struct mce *)data;
  503. if (!m)
  504. return NOTIFY_DONE;
  505. if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
  506. return NOTIFY_DONE;
  507. __print_mce(m);
  508. return NOTIFY_DONE;
  509. }
  510. static struct notifier_block mce_default_nb = {
  511. .notifier_call = mce_default_notifier,
  512. /* lowest prio, we want it to run last. */
  513. .priority = MCE_PRIO_LOWEST,
  514. };
  515. /*
  516. * Read ADDR and MISC registers.
  517. */
  518. static void mce_read_aux(struct mce *m, int i)
  519. {
  520. if (m->status & MCI_STATUS_MISCV)
  521. m->misc = mce_rdmsrl(msr_ops.misc(i));
  522. if (m->status & MCI_STATUS_ADDRV) {
  523. m->addr = mce_rdmsrl(msr_ops.addr(i));
  524. /*
  525. * Mask the reported address by the reported granularity.
  526. */
  527. if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
  528. u8 shift = MCI_MISC_ADDR_LSB(m->misc);
  529. m->addr >>= shift;
  530. m->addr <<= shift;
  531. }
  532. /*
  533. * Extract [55:<lsb>] where lsb is the least significant
  534. * *valid* bit of the address bits.
  535. */
  536. if (mce_flags.smca) {
  537. u8 lsb = (m->addr >> 56) & 0x3f;
  538. m->addr &= GENMASK_ULL(55, lsb);
  539. }
  540. }
  541. if (mce_flags.smca) {
  542. m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
  543. if (m->status & MCI_STATUS_SYNDV)
  544. m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
  545. }
  546. }
  547. DEFINE_PER_CPU(unsigned, mce_poll_count);
  548. /*
  549. * Poll for corrected events or events that happened before reset.
  550. * Those are just logged through /dev/mcelog.
  551. *
  552. * This is executed in standard interrupt context.
  553. *
  554. * Note: spec recommends to panic for fatal unsignalled
  555. * errors here. However this would be quite problematic --
  556. * we would need to reimplement the Monarch handling and
  557. * it would mess up the exclusion between exception handler
558. * and poll handler -- so we skip this for now.
559. * These cases should not happen anyway, or only when the CPU
560. * is already totally confused. In this case it's likely it will
  561. * not fully execute the machine check handler either.
  562. */
  563. bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
  564. {
  565. bool error_seen = false;
  566. struct mce m;
  567. int severity;
  568. int i;
  569. this_cpu_inc(mce_poll_count);
  570. mce_gather_info(&m, NULL);
  571. if (flags & MCP_TIMESTAMP)
  572. m.tsc = rdtsc();
  573. for (i = 0; i < mca_cfg.banks; i++) {
  574. if (!mce_banks[i].ctl || !test_bit(i, *b))
  575. continue;
  576. m.misc = 0;
  577. m.addr = 0;
  578. m.bank = i;
  579. barrier();
  580. m.status = mce_rdmsrl(msr_ops.status(i));
  581. if (!(m.status & MCI_STATUS_VAL))
  582. continue;
  583. /*
  584. * Uncorrected or signalled events are handled by the exception
  585. * handler when it is enabled, so don't process those here.
  586. *
  587. * TBD do the same check for MCI_STATUS_EN here?
  588. */
  589. if (!(flags & MCP_UC) &&
  590. (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
  591. continue;
  592. error_seen = true;
  593. mce_read_aux(&m, i);
  594. severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
  595. if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m))
  596. if (m.status & MCI_STATUS_ADDRV)
  597. m.severity = severity;
  598. /*
  599. * Don't get the IP here because it's unlikely to
  600. * have anything to do with the actual error location.
  601. */
  602. if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
  603. mce_log(&m);
  604. else if (mce_usable_address(&m)) {
  605. /*
  606. * Although we skipped logging this, we still want
  607. * to take action. Add to the pool so the registered
  608. * notifiers will see it.
  609. */
  610. if (!mce_gen_pool_add(&m))
  611. mce_schedule_work();
  612. }
  613. /*
  614. * Clear state for this bank.
  615. */
  616. mce_wrmsrl(msr_ops.status(i), 0);
  617. }
  618. /*
  619. * Don't clear MCG_STATUS here because it's only defined for
  620. * exceptions.
  621. */
  622. sync_core();
  623. return error_seen;
  624. }
  625. EXPORT_SYMBOL_GPL(machine_check_poll);
  626. /*
  627. * Do a quick check if any of the events requires a panic.
  628. * This decides if we keep the events around or clear them.
  629. */
  630. static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
  631. struct pt_regs *regs)
  632. {
  633. int i, ret = 0;
  634. char *tmp;
  635. for (i = 0; i < mca_cfg.banks; i++) {
  636. m->status = mce_rdmsrl(msr_ops.status(i));
  637. if (m->status & MCI_STATUS_VAL) {
  638. __set_bit(i, validp);
  639. if (quirk_no_way_out)
  640. quirk_no_way_out(i, m, regs);
  641. }
  642. if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
  643. *msg = tmp;
  644. ret = 1;
  645. }
  646. }
  647. return ret;
  648. }
  649. /*
  650. * Variable to establish order between CPUs while scanning.
651. * Each CPU spins initially until mce_executing equals its number.
  652. */
  653. static atomic_t mce_executing;
  654. /*
  655. * Defines order of CPUs on entry. First CPU becomes Monarch.
  656. */
  657. static atomic_t mce_callin;
  658. /*
  659. * Check if a timeout waiting for other CPUs happened.
  660. */
  661. static int mce_timed_out(u64 *t, const char *msg)
  662. {
  663. /*
  664. * The others already did panic for some reason.
  665. * Bail out like in a timeout.
  666. * rmb() to tell the compiler that system_state
  667. * might have been modified by someone else.
  668. */
  669. rmb();
  670. if (atomic_read(&mce_panicked))
  671. wait_for_panic();
  672. if (!mca_cfg.monarch_timeout)
  673. goto out;
  674. if ((s64)*t < SPINUNIT) {
  675. if (mca_cfg.tolerant <= 1)
  676. mce_panic(msg, NULL, NULL);
  677. cpu_missing = 1;
  678. return 1;
  679. }
  680. *t -= SPINUNIT;
  681. out:
  682. touch_nmi_watchdog();
  683. return 0;
  684. }
  685. /*
  686. * The Monarch's reign. The Monarch is the CPU who entered
  687. * the machine check handler first. It waits for the others to
  688. * raise the exception too and then grades them. When any
689. * error is fatal, panic. Only then let the others continue.
  690. *
  691. * The other CPUs entering the MCE handler will be controlled by the
  692. * Monarch. They are called Subjects.
  693. *
694. * This way we prevent any potential data corruption in an unrecoverable case
695. * and also make sure that all CPUs' errors are always examined.
  696. *
  697. * Also this detects the case of a machine check event coming from outer
698. * space (not detected by any CPU). In this case some external agent wants
  699. * us to shut down, so panic too.
  700. *
  701. * The other CPUs might still decide to panic if the handler happens
702. * in an unrecoverable place, but in this case the system is in a semi-stable
  703. * state and won't corrupt anything by itself. It's ok to let the others
  704. * continue for a bit first.
  705. *
  706. * All the spin loops have timeouts; when a timeout happens a CPU
  707. * typically elects itself to be Monarch.
  708. */
  709. static void mce_reign(void)
  710. {
  711. int cpu;
  712. struct mce *m = NULL;
  713. int global_worst = 0;
  714. char *msg = NULL;
  715. char *nmsg = NULL;
  716. /*
  717. * This CPU is the Monarch and the other CPUs have run
  718. * through their handlers.
  719. * Grade the severity of the errors of all the CPUs.
  720. */
  721. for_each_possible_cpu(cpu) {
  722. int severity = mce_severity(&per_cpu(mces_seen, cpu),
  723. mca_cfg.tolerant,
  724. &nmsg, true);
  725. if (severity > global_worst) {
  726. msg = nmsg;
  727. global_worst = severity;
  728. m = &per_cpu(mces_seen, cpu);
  729. }
  730. }
  731. /*
  732. * Cannot recover? Panic here then.
  733. * This dumps all the mces in the log buffer and stops the
  734. * other CPUs.
  735. */
  736. if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
  737. mce_panic("Fatal machine check", m, msg);
  738. /*
  739. * For UC somewhere we let the CPU who detects it handle it.
740. * We also must let the others continue, otherwise the handling
  741. * CPU could deadlock on a lock.
  742. */
  743. /*
  744. * No machine check event found. Must be some external
  745. * source or one CPU is hung. Panic.
  746. */
  747. if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
  748. mce_panic("Fatal machine check from unknown source", NULL, NULL);
  749. /*
  750. * Now clear all the mces_seen so that they don't reappear on
  751. * the next mce.
  752. */
  753. for_each_possible_cpu(cpu)
  754. memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
  755. }
  756. static atomic_t global_nwo;
  757. /*
  758. * Start of Monarch synchronization. This waits until all CPUs have
  759. * entered the exception handler and then determines if any of them
  760. * saw a fatal event that requires panic. Then it executes them
  761. * in the entry order.
  762. * TBD double check parallel CPU hotunplug
  763. */
  764. static int mce_start(int *no_way_out)
  765. {
  766. int order;
  767. int cpus = num_online_cpus();
  768. u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
  769. if (!timeout)
  770. return -1;
  771. atomic_add(*no_way_out, &global_nwo);
  772. /*
  773. * Rely on the implied barrier below, such that global_nwo
  774. * is updated before mce_callin.
  775. */
  776. order = atomic_inc_return(&mce_callin);
  777. /*
  778. * Wait for everyone.
  779. */
  780. while (atomic_read(&mce_callin) != cpus) {
  781. if (mce_timed_out(&timeout,
  782. "Timeout: Not all CPUs entered broadcast exception handler")) {
  783. atomic_set(&global_nwo, 0);
  784. return -1;
  785. }
  786. ndelay(SPINUNIT);
  787. }
  788. /*
  789. * mce_callin should be read before global_nwo
  790. */
  791. smp_rmb();
  792. if (order == 1) {
  793. /*
  794. * Monarch: Starts executing now, the others wait.
  795. */
  796. atomic_set(&mce_executing, 1);
  797. } else {
  798. /*
  799. * Subject: Now start the scanning loop one by one in
  800. * the original callin order.
  801. * This way when there are any shared banks it will be
  802. * only seen by one CPU before cleared, avoiding duplicates.
  803. */
  804. while (atomic_read(&mce_executing) < order) {
  805. if (mce_timed_out(&timeout,
  806. "Timeout: Subject CPUs unable to finish machine check processing")) {
  807. atomic_set(&global_nwo, 0);
  808. return -1;
  809. }
  810. ndelay(SPINUNIT);
  811. }
  812. }
  813. /*
  814. * Cache the global no_way_out state.
  815. */
  816. *no_way_out = atomic_read(&global_nwo);
  817. return order;
  818. }
  819. /*
  820. * Synchronize between CPUs after main scanning loop.
  821. * This invokes the bulk of the Monarch processing.
  822. */
  823. static int mce_end(int order)
  824. {
  825. int ret = -1;
  826. u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
  827. if (!timeout)
  828. goto reset;
  829. if (order < 0)
  830. goto reset;
  831. /*
  832. * Allow others to run.
  833. */
  834. atomic_inc(&mce_executing);
  835. if (order == 1) {
  836. /* CHECKME: Can this race with a parallel hotplug? */
  837. int cpus = num_online_cpus();
  838. /*
  839. * Monarch: Wait for everyone to go through their scanning
  840. * loops.
  841. */
  842. while (atomic_read(&mce_executing) <= cpus) {
  843. if (mce_timed_out(&timeout,
  844. "Timeout: Monarch CPU unable to finish machine check processing"))
  845. goto reset;
  846. ndelay(SPINUNIT);
  847. }
  848. mce_reign();
  849. barrier();
  850. ret = 0;
  851. } else {
  852. /*
  853. * Subject: Wait for Monarch to finish.
  854. */
  855. while (atomic_read(&mce_executing) != 0) {
  856. if (mce_timed_out(&timeout,
  857. "Timeout: Monarch CPU did not finish machine check processing"))
  858. goto reset;
  859. ndelay(SPINUNIT);
  860. }
  861. /*
  862. * Don't reset anything. That's done by the Monarch.
  863. */
  864. return 0;
  865. }
  866. /*
  867. * Reset all global state.
  868. */
  869. reset:
  870. atomic_set(&global_nwo, 0);
  871. atomic_set(&mce_callin, 0);
  872. barrier();
  873. /*
  874. * Let others run again.
  875. */
  876. atomic_set(&mce_executing, 0);
  877. return ret;
  878. }
  879. static void mce_clear_state(unsigned long *toclear)
  880. {
  881. int i;
  882. for (i = 0; i < mca_cfg.banks; i++) {
  883. if (test_bit(i, toclear))
  884. mce_wrmsrl(msr_ops.status(i), 0);
  885. }
  886. }
  887. static int do_memory_failure(struct mce *m)
  888. {
  889. int flags = MF_ACTION_REQUIRED;
  890. int ret;
  891. pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
  892. if (!(m->mcgstatus & MCG_STATUS_RIPV))
  893. flags |= MF_MUST_KILL;
  894. ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
  895. if (ret)
  896. pr_err("Memory error not recovered");
  897. return ret;
  898. }
  899. /*
  900. * The actual machine check handler. This only handles real
  901. * exceptions when something got corrupted coming in through int 18.
  902. *
  903. * This is executed in NMI context not subject to normal locking rules. This
  904. * implies that most kernel services cannot be safely used. Don't even
  905. * think about putting a printk in there!
  906. *
  907. * On Intel systems this is entered on all CPUs in parallel through
  908. * MCE broadcast. However some CPUs might be broken beyond repair,
909. * so always be careful when synchronizing with others.
  910. */
  911. void do_machine_check(struct pt_regs *regs, long error_code)
  912. {
  913. struct mca_config *cfg = &mca_cfg;
  914. struct mce m, *final;
  915. int i;
  916. int worst = 0;
  917. int severity;
  918. /*
  919. * Establish sequential order between the CPUs entering the machine
  920. * check handler.
  921. */
  922. int order = -1;
  923. /*
  924. * If no_way_out gets set, there is no safe way to recover from this
  925. * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
  926. */
  927. int no_way_out = 0;
  928. /*
  929. * If kill_it gets set, there might be a way to recover from this
  930. * error.
  931. */
  932. int kill_it = 0;
  933. DECLARE_BITMAP(toclear, MAX_NR_BANKS);
  934. DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
  935. char *msg = "Unknown";
  936. /*
  937. * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
  938. * on Intel.
  939. */
  940. int lmce = 1;
  941. int cpu = smp_processor_id();
  942. /*
  943. * Cases where we avoid rendezvous handler timeout:
  944. * 1) If this CPU is offline.
  945. *
  946. * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
  947. * skip those CPUs which remain looping in the 1st kernel - see
  948. * crash_nmi_callback().
  949. *
  950. * Note: there still is a small window between kexec-ing and the new,
  951. * kdump kernel establishing a new #MC handler where a broadcasted MCE
  952. * might not get handled properly.
  953. */
  954. if (cpu_is_offline(cpu) ||
  955. (crashing_cpu != -1 && crashing_cpu != cpu)) {
  956. u64 mcgstatus;
  957. mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
  958. if (mcgstatus & MCG_STATUS_RIPV) {
  959. mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
  960. return;
  961. }
  962. }
  963. ist_enter(regs);
  964. this_cpu_inc(mce_exception_count);
  965. if (!cfg->banks)
  966. goto out;
  967. mce_gather_info(&m, regs);
  968. m.tsc = rdtsc();
  969. final = this_cpu_ptr(&mces_seen);
  970. *final = m;
  971. memset(valid_banks, 0, sizeof(valid_banks));
  972. no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
  973. barrier();
  974. /*
975. * When there is no restart IP we might need to kill or panic.
  976. * Assume the worst for now, but if we find the
  977. * severity is MCE_AR_SEVERITY we have other options.
  978. */
  979. if (!(m.mcgstatus & MCG_STATUS_RIPV))
  980. kill_it = 1;
  981. /*
  982. * Check if this MCE is signaled to only this logical processor,
  983. * on Intel only.
  984. */
  985. if (m.cpuvendor == X86_VENDOR_INTEL)
  986. lmce = m.mcgstatus & MCG_STATUS_LMCES;
  987. /*
  988. * Go through all banks in exclusion of the other CPUs. This way we
  989. * don't report duplicated events on shared banks because the first one
  990. * to see it will clear it. If this is a Local MCE, then no need to
  991. * perform rendezvous.
  992. */
  993. if (!lmce)
  994. order = mce_start(&no_way_out);
  995. for (i = 0; i < cfg->banks; i++) {
  996. __clear_bit(i, toclear);
  997. if (!test_bit(i, valid_banks))
  998. continue;
  999. if (!mce_banks[i].ctl)
  1000. continue;
  1001. m.misc = 0;
  1002. m.addr = 0;
  1003. m.bank = i;
  1004. m.status = mce_rdmsrl(msr_ops.status(i));
  1005. if ((m.status & MCI_STATUS_VAL) == 0)
  1006. continue;
  1007. /*
1008. * Non-uncorrected or non-signaled errors are handled by
  1009. * machine_check_poll. Leave them alone, unless this panics.
  1010. */
  1011. if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
  1012. !no_way_out)
  1013. continue;
  1014. /*
  1015. * Set taint even when machine check was not enabled.
  1016. */
  1017. add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
  1018. severity = mce_severity(&m, cfg->tolerant, NULL, true);
  1019. /*
1020. * When the machine check was for a corrected/deferred error, don't
1021. * touch it, unless we're panicking.
  1022. */
  1023. if ((severity == MCE_KEEP_SEVERITY ||
  1024. severity == MCE_UCNA_SEVERITY) && !no_way_out)
  1025. continue;
  1026. __set_bit(i, toclear);
  1027. if (severity == MCE_NO_SEVERITY) {
  1028. /*
  1029. * Machine check event was not enabled. Clear, but
  1030. * ignore.
  1031. */
  1032. continue;
  1033. }
  1034. mce_read_aux(&m, i);
  1035. /* assuming valid severity level != 0 */
  1036. m.severity = severity;
  1037. mce_log(&m);
  1038. if (severity > worst) {
  1039. *final = m;
  1040. worst = severity;
  1041. }
  1042. }
  1043. /* mce_clear_state will clear *final, save locally for use later */
  1044. m = *final;
  1045. if (!no_way_out)
  1046. mce_clear_state(toclear);
  1047. /*
  1048. * Do most of the synchronization with other CPUs.
  1049. * When there's any problem use only local no_way_out state.
  1050. */
  1051. if (!lmce) {
  1052. if (mce_end(order) < 0)
  1053. no_way_out = worst >= MCE_PANIC_SEVERITY;
  1054. } else {
  1055. /*
  1056. * Local MCE skipped calling mce_reign()
  1057. * If we found a fatal error, we need to panic here.
  1058. */
  1059. if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
  1060. mce_panic("Machine check from unknown source",
  1061. NULL, NULL);
  1062. }
  1063. /*
  1064. * If tolerant is at an insane level we drop requests to kill
  1065. * processes and continue even when there is no way out.
  1066. */
  1067. if (cfg->tolerant == 3)
  1068. kill_it = 0;
  1069. else if (no_way_out)
  1070. mce_panic("Fatal machine check on current CPU", &m, msg);
  1071. if (worst > 0)
  1072. mce_report_event(regs);
  1073. mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
  1074. out:
  1075. sync_core();
  1076. if (worst != MCE_AR_SEVERITY && !kill_it)
  1077. goto out_ist;
  1078. /* Fault was in user mode and we need to take some action */
  1079. if ((m.cs & 3) == 3) {
  1080. ist_begin_non_atomic(regs);
  1081. local_irq_enable();
  1082. if (kill_it || do_memory_failure(&m))
  1083. force_sig(SIGBUS, current);
  1084. local_irq_disable();
  1085. ist_end_non_atomic();
  1086. } else {
  1087. if (!fixup_exception(regs, X86_TRAP_MC))
  1088. mce_panic("Failed kernel mode recovery", &m, NULL);
  1089. }
  1090. out_ist:
  1091. ist_exit(regs);
  1092. }
  1093. EXPORT_SYMBOL_GPL(do_machine_check);
  1094. #ifndef CONFIG_MEMORY_FAILURE
  1095. int memory_failure(unsigned long pfn, int vector, int flags)
  1096. {
  1097. /* mce_severity() should not hand us an ACTION_REQUIRED error */
  1098. BUG_ON(flags & MF_ACTION_REQUIRED);
  1099. pr_err("Uncorrected memory error in page 0x%lx ignored\n"
  1100. "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
  1101. pfn);
  1102. return 0;
  1103. }
  1104. #endif
  1105. /*
  1106. * Periodic polling timer for "silent" machine check errors. If the
  1107. * poller finds an MCE, poll 2x faster. When the poller finds no more
  1108. * errors, poll 2x slower (up to check_interval seconds).
  1109. */
  1110. static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
  1111. static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
  1112. static DEFINE_PER_CPU(struct timer_list, mce_timer);
  1113. static unsigned long mce_adjust_timer_default(unsigned long interval)
  1114. {
  1115. return interval;
  1116. }
  1117. static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
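/*
 * Arm the polling timer for @interval from now, or pull an already
 * pending timer forward if the new expiry would be earlier.
 */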
  1118. static void __start_timer(struct timer_list *t, unsigned long interval)
  1119. {
  1120. unsigned long when = jiffies + interval;
  1121. unsigned long flags;
  1122. local_irq_save(flags);
  1123. if (!timer_pending(t) || time_before(when, t->expires))
  1124. mod_timer(t, round_jiffies(when));
  1125. local_irq_restore(flags);
  1126. }
  1127. static void mce_timer_fn(unsigned long data)
  1128. {
  1129. struct timer_list *t = this_cpu_ptr(&mce_timer);
  1130. int cpu = smp_processor_id();
  1131. unsigned long iv;
  1132. WARN_ON(cpu != data);
  1133. iv = __this_cpu_read(mce_next_interval);
  1134. if (mce_available(this_cpu_ptr(&cpu_info))) {
  1135. machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
  1136. if (mce_intel_cmci_poll()) {
  1137. iv = mce_adjust_timer(iv);
  1138. goto done;
  1139. }
  1140. }
  1141. /*
  1142. * Alert userspace if needed. If we logged an MCE, reduce the polling
  1143. * interval, otherwise increase the polling interval.
  1144. */
  1145. if (mce_notify_irq())
  1146. iv = max(iv / 2, (unsigned long) HZ/100);
  1147. else
  1148. iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
  1149. done:
  1150. __this_cpu_write(mce_next_interval, iv);
  1151. __start_timer(t, iv);
  1152. }
  1153. /*
  1154. * Ensure that the timer is firing in @interval from now.
  1155. */
  1156. void mce_timer_kick(unsigned long interval)
  1157. {
  1158. struct timer_list *t = this_cpu_ptr(&mce_timer);
  1159. unsigned long iv = __this_cpu_read(mce_next_interval);
  1160. __start_timer(t, interval);
  1161. if (interval < iv)
  1162. __this_cpu_write(mce_next_interval, interval);
  1163. }
  1164. /* Must not be called in IRQ context where del_timer_sync() can deadlock */
  1165. static void mce_timer_delete_all(void)
  1166. {
  1167. int cpu;
  1168. for_each_online_cpu(cpu)
  1169. del_timer_sync(&per_cpu(mce_timer, cpu));
  1170. }
  1171. /*
  1172. * Notify the user(s) about new machine check events.
  1173. * Can be called from interrupt context, but not from machine check/NMI
  1174. * context.
  1175. */
  1176. int mce_notify_irq(void)
  1177. {
  1178. /* Not more than two messages every minute */
  1179. static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
  1180. if (test_and_clear_bit(0, &mce_need_notify)) {
  1181. mce_work_trigger();
  1182. if (__ratelimit(&ratelimit))
  1183. pr_info(HW_ERR "Machine check events logged\n");
  1184. return 1;
  1185. }
  1186. return 0;
  1187. }
  1188. EXPORT_SYMBOL_GPL(mce_notify_irq);
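/* Allocate the per-bank array and enable every bank with all control bits set. */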
  1189. static int __mcheck_cpu_mce_banks_init(void)
  1190. {
  1191. int i;
  1192. u8 num_banks = mca_cfg.banks;
  1193. mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
  1194. if (!mce_banks)
  1195. return -ENOMEM;
  1196. for (i = 0; i < num_banks; i++) {
  1197. struct mce_bank *b = &mce_banks[i];
  1198. b->ctl = -1ULL;
  1199. b->init = 1;
  1200. }
  1201. return 0;
  1202. }
  1203. /*
  1204. * Initialize Machine Checks for a CPU.
  1205. */
  1206. static int __mcheck_cpu_cap_init(void)
  1207. {
  1208. unsigned b;
  1209. u64 cap;
  1210. rdmsrl(MSR_IA32_MCG_CAP, cap);
  1211. b = cap & MCG_BANKCNT_MASK;
  1212. if (!mca_cfg.banks)
  1213. pr_info("CPU supports %d MCE banks\n", b);
  1214. if (b > MAX_NR_BANKS) {
  1215. pr_warn("Using only %u machine check banks out of %u\n",
  1216. MAX_NR_BANKS, b);
  1217. b = MAX_NR_BANKS;
  1218. }
  1219. /* Don't support asymmetric configurations today */
  1220. WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
  1221. mca_cfg.banks = b;
  1222. if (!mce_banks) {
  1223. int err = __mcheck_cpu_mce_banks_init();
  1224. if (err)
  1225. return err;
  1226. }
  1227. /* Use accurate RIP reporting if available. */
  1228. if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
  1229. mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
  1230. if (cap & MCG_SER_P)
  1231. mca_cfg.ser = true;
  1232. return 0;
  1233. }
  1234. static void __mcheck_cpu_init_generic(void)
  1235. {
  1236. enum mcp_flags m_fl = 0;
  1237. mce_banks_t all_banks;
  1238. u64 cap;
  1239. if (!mca_cfg.bootlog)
  1240. m_fl = MCP_DONTLOG;
  1241. /*
  1242. * Log the machine checks left over from the previous reset.
  1243. */
  1244. bitmap_fill(all_banks, MAX_NR_BANKS);
  1245. machine_check_poll(MCP_UC | m_fl, &all_banks);
  1246. cr4_set_bits(X86_CR4_MCE);
  1247. rdmsrl(MSR_IA32_MCG_CAP, cap);
  1248. if (cap & MCG_CTL_P)
  1249. wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
  1250. }
  1251. static void __mcheck_cpu_init_clear_banks(void)
  1252. {
  1253. int i;
  1254. for (i = 0; i < mca_cfg.banks; i++) {
  1255. struct mce_bank *b = &mce_banks[i];
  1256. if (!b->init)
  1257. continue;
  1258. wrmsrl(msr_ops.ctl(i), b->ctl);
  1259. wrmsrl(msr_ops.status(i), 0);
  1260. }
  1261. }
  1262. /*
  1263. * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
  1264. * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
  1265. * Vol 3B Table 15-20). But this confuses both the code that determines
  1266. * whether the machine check occurred in kernel or user mode, and also
  1267. * the severity assessment code. Pretend that EIPV was set, and take the
  1268. * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
  1269. */
  1270. static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
  1271. {
  1272. if (bank != 0)
  1273. return;
  1274. if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
  1275. return;
  1276. if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
  1277. MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
  1278. MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
  1279. MCACOD)) !=
  1280. (MCI_STATUS_UC|MCI_STATUS_EN|
  1281. MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
  1282. MCI_STATUS_AR|MCACOD_INSTR))
  1283. return;
  1284. m->mcgstatus |= MCG_STATUS_EIPV;
  1285. m->ip = regs->ip;
  1286. m->cs = regs->cs;
  1287. }
  1288. /* Add per CPU specific workarounds here */
  1289. static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
  1290. {
  1291. struct mca_config *cfg = &mca_cfg;
  1292. if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
  1293. pr_info("unknown CPU type - not enabling MCE support\n");
  1294. return -EOPNOTSUPP;
  1295. }
  1296. /* This should be disabled by the BIOS, but isn't always */
  1297. if (c->x86_vendor == X86_VENDOR_AMD) {
  1298. if (c->x86 == 15 && cfg->banks > 4) {
  1299. /*
  1300. * disable GART TBL walk error reporting, which
  1301. * trips off incorrectly with the IOMMU & 3ware
  1302. * & Cerberus:
  1303. */
  1304. clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
  1305. }
  1306. if (c->x86 < 17 && cfg->bootlog < 0) {
  1307. /*
1308. * Lots of broken BIOSes around that don't clear them
  1309. * by default and leave crap in there. Don't log:
  1310. */
  1311. cfg->bootlog = 0;
  1312. }
  1313. /*
  1314. * Various K7s with broken bank 0 around. Always disable
  1315. * by default.
  1316. */
  1317. if (c->x86 == 6 && cfg->banks > 0)
  1318. mce_banks[0].ctl = 0;
  1319. /*
  1320. * overflow_recov is supported for F15h Models 00h-0fh
  1321. * even though we don't have a CPUID bit for it.
  1322. */
  1323. if (c->x86 == 0x15 && c->x86_model <= 0xf)
  1324. mce_flags.overflow_recov = 1;
  1325. /*
  1326. * Turn off MC4_MISC thresholding banks on those models since
  1327. * they're not supported there.
  1328. */
  1329. if (c->x86 == 0x15 &&
  1330. (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
  1331. int i;
  1332. u64 hwcr;
  1333. bool need_toggle;
  1334. u32 msrs[] = {
  1335. 0x00000413, /* MC4_MISC0 */
  1336. 0xc0000408, /* MC4_MISC1 */
  1337. };
  1338. rdmsrl(MSR_K7_HWCR, hwcr);
  1339. /* McStatusWrEn has to be set */
  1340. need_toggle = !(hwcr & BIT(18));
  1341. if (need_toggle)
  1342. wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
  1343. /* Clear CntP bit safely */
  1344. for (i = 0; i < ARRAY_SIZE(msrs); i++)
  1345. msr_clear_bit(msrs[i], 62);
  1346. /* restore old settings */
  1347. if (need_toggle)
  1348. wrmsrl(MSR_K7_HWCR, hwcr);
  1349. }
  1350. }
  1351. if (c->x86_vendor == X86_VENDOR_INTEL) {
  1352. /*
  1353. * SDM documents that on family 6 bank 0 should not be written
  1354. * because it aliases to another special BIOS controlled
  1355. * register.
1356. * But it's not aliased anymore on model 0x1a+.
  1357. * Don't ignore bank 0 completely because there could be a
  1358. * valid event later, merely don't write CTL0.
  1359. */
  1360. if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
  1361. mce_banks[0].init = 0;
  1362. /*
  1363. * All newer Intel systems support MCE broadcasting. Enable
  1364. * synchronization with a one second timeout.
  1365. */
  1366. if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
  1367. cfg->monarch_timeout < 0)
  1368. cfg->monarch_timeout = USEC_PER_SEC;
  1369. /*
  1370. * There are also broken BIOSes on some Pentium M and
  1371. * earlier systems:
  1372. */
  1373. if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
  1374. cfg->bootlog = 0;
  1375. if (c->x86 == 6 && c->x86_model == 45)
  1376. quirk_no_way_out = quirk_sandybridge_ifu;
  1377. }
  1378. if (cfg->monarch_timeout < 0)
  1379. cfg->monarch_timeout = 0;
  1380. if (cfg->bootlog != 0)
  1381. cfg->panic_timeout = 30;
  1382. return 0;
  1383. }
  1384. static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
  1385. {
  1386. if (c->x86 != 5)
  1387. return 0;
  1388. switch (c->x86_vendor) {
  1389. case X86_VENDOR_INTEL:
  1390. intel_p5_mcheck_init(c);
  1391. return 1;
  1392. break;
  1393. case X86_VENDOR_CENTAUR:
  1394. winchip_mcheck_init(c);
  1395. return 1;
  1396. break;
  1397. default:
  1398. return 0;
  1399. }
  1400. return 0;
  1401. }

/*
 * Init basic CPU features needed for early decoding of MCEs.
 */
static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
{
        if (c->x86_vendor == X86_VENDOR_AMD) {
                mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
                mce_flags.succor         = !!cpu_has(c, X86_FEATURE_SUCCOR);
                mce_flags.smca           = !!cpu_has(c, X86_FEATURE_SMCA);

                if (mce_flags.smca) {
                        msr_ops.ctl    = smca_ctl_reg;
                        msr_ops.status = smca_status_reg;
                        msr_ops.addr   = smca_addr_reg;
                        msr_ops.misc   = smca_misc_reg;
                }
        }
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                mce_adjust_timer = cmci_intel_adjust_timer;
                break;

        case X86_VENDOR_AMD: {
                mce_amd_feature_init(c);
                break;
                }

        default:
                break;
        }
}

static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_clear(c);
                break;

        default:
                break;
        }
}

static void mce_start_timer(struct timer_list *t)
{
        unsigned long iv = check_interval * HZ;

        if (mca_cfg.ignore_ce || !iv)
                return;

        this_cpu_write(mce_next_interval, iv);
        __start_timer(t, iv);
}

static void __mcheck_cpu_setup_timer(void)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);
        unsigned int cpu = smp_processor_id();

        setup_pinned_timer(t, mce_timer_fn, cpu);
}

static void __mcheck_cpu_init_timer(void)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);
        unsigned int cpu = smp_processor_id();

        setup_pinned_timer(t, mce_timer_fn, cpu);
        mce_start_timer(t);
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
        pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
               smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
                                                unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
        if (mca_cfg.disabled)
                return;

        if (__mcheck_cpu_ancient_init(c))
                return;

        if (!mce_available(c))
                return;

        if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
                mca_cfg.disabled = true;
                return;
        }

        if (mce_gen_pool_init()) {
                mca_cfg.disabled = true;
                pr_emerg("Couldn't allocate MCE records pool!\n");
                return;
        }

        machine_check_vector = do_machine_check;

        __mcheck_cpu_init_early(c);
        __mcheck_cpu_init_generic();
        __mcheck_cpu_init_vendor(c);
        __mcheck_cpu_init_clear_banks();
        __mcheck_cpu_setup_timer();
}

/*
 * Called for each booted CPU to clear some machine checks opt-ins
 */
void mcheck_cpu_clear(struct cpuinfo_x86 *c)
{
        if (mca_cfg.disabled)
                return;

        if (!mce_available(c))
                return;

        /*
         * Possibly to clear general settings generic to x86
         * __mcheck_cpu_clear_generic(c);
         */
        __mcheck_cpu_clear_vendor(c);
}

static void __mce_disable_bank(void *arg)
{
        int bank = *((int *)arg);

        __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
        cmci_disable_bank(bank);
}

void mce_disable_bank(int bank)
{
        if (bank >= mca_cfg.banks) {
                pr_warn(FW_BUG
                        "Ignoring request to disable invalid MCA bank %d.\n",
                        bank);
                return;
        }
        set_bit(bank, mce_banks_ce_disabled);
        on_each_cpu(__mce_disable_bank, &bank, 1);
}

/*
 * mce=off                      Disables machine check
 * mce=no_cmci                  Disables CMCI
 * mce=no_lmce                  Disables LMCE
 * mce=dont_log_ce              Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce                Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *      monarchtimeout is how long to wait for other CPUs on machine
 *      check, or 0 to not wait
 * mce=bootlog                  Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog                Don't log MCEs from before booting.
 * mce=bios_cmci_threshold      Don't program the CMCI threshold
 * mce=recovery                 force enable memcpy_mcsafe()
 */
static int __init mcheck_enable(char *str)
{
        struct mca_config *cfg = &mca_cfg;

        if (*str == 0) {
                enable_p5_mce();
                return 1;
        }
        if (*str == '=')
                str++;
        if (!strcmp(str, "off"))
                cfg->disabled = true;
        else if (!strcmp(str, "no_cmci"))
                cfg->cmci_disabled = true;
        else if (!strcmp(str, "no_lmce"))
                cfg->lmce_disabled = true;
        else if (!strcmp(str, "dont_log_ce"))
                cfg->dont_log_ce = true;
        else if (!strcmp(str, "ignore_ce"))
                cfg->ignore_ce = true;
        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                cfg->bootlog = (str[0] == 'b');
        else if (!strcmp(str, "bios_cmci_threshold"))
                cfg->bios_cmci_threshold = true;
        else if (!strcmp(str, "recovery"))
                cfg->recovery = true;
        else if (isdigit(str[0])) {
                if (get_option(&str, &cfg->tolerant) == 2)
                        get_option(&str, &(cfg->monarch_timeout));
        } else {
                pr_info("mce argument %s ignored. Please use /sys\n", str);
                return 0;
        }
        return 1;
}
__setup("mce", mcheck_enable);
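
/*
 * Editorial note, illustrative examples only (not part of the original
 * source): given the parsing above, booting with
 *
 *   mce=2,500000
 *
 * would set cfg->tolerant = 2 and cfg->monarch_timeout = 500000 usec via
 * get_option(), while e.g. "mce=no_cmci" or "mce=dont_log_ce" simply flip
 * the corresponding flags in mca_cfg. Any unrecognized string is ignored
 * with a hint to use the /sys interface instead.
 */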

int __init mcheck_init(void)
{
        mcheck_intel_therm_init();
        mce_register_decode_chain(&first_nb);
        mce_register_decode_chain(&mce_srao_nb);
        mce_register_decode_chain(&mce_default_nb);
        mcheck_vendor_init_severity();

        INIT_WORK(&mce_work, mce_gen_pool_process);
        init_irq_work(&mce_irq_work, mce_irq_work_cb);

        return 0;
}

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static void mce_disable_error_reporting(void)
{
        int i;

        for (i = 0; i < mca_cfg.banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrl(msr_ops.ctl(i), 0);
        }
        return;
}

static void vendor_disable_error_reporting(void)
{
        /*
         * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
         * Disabling them for just a single offlined CPU is bad, since it will
         * inhibit reporting for all shared resources on the socket like the
         * last level cache (LLC), the integrated memory controller (iMC), etc.
         */
        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                return;

        mce_disable_error_reporting();
}

static int mce_syscore_suspend(void)
{
        vendor_disable_error_reporting();
        return 0;
}

static void mce_syscore_shutdown(void)
{
        vendor_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
        __mcheck_cpu_init_generic();
        __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
        __mcheck_cpu_init_clear_banks();
}

static struct syscore_ops mce_syscore_ops = {
        .suspend  = mce_syscore_suspend,
        .shutdown = mce_syscore_shutdown,
        .resume   = mce_syscore_resume,
};

/*
 * mce_device: Sysfs support
 */

static void mce_cpu_restart(void *data)
{
        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;
        __mcheck_cpu_init_generic();
        __mcheck_cpu_init_clear_banks();
        __mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        mce_timer_delete_all();
        on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;
        cmci_clear();
}

static void mce_enable_ce(void *all)
{
        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;
        cmci_reenable();
        cmci_recheck();
        if (all)
                __mcheck_cpu_init_timer();
}

static struct bus_type mce_subsys = {
        .name     = "machinecheck",
        .dev_name = "machinecheck",
};

DEFINE_PER_CPU(struct device *, mce_device);

static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
        return container_of(attr, struct mce_bank, attr);
}
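
/*
 * Editorial note: each struct mce_bank embeds its own struct
 * device_attribute ("attr"), so container_of() can map the attribute
 * pointer handed to show_bank()/set_bank() back to the owning bank
 * without any separate lookup table.
 */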

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
                         char *buf)
{
        return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
                        const char *buf, size_t size)
{
        u64 new;

        if (kstrtou64(buf, 0, &new) < 0)
                return -EINVAL;

        attr_to_bank(attr)->ctl = new;
        mce_restart();

        return size;
}
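
/*
 * Editorial note, illustrative usage only (assuming the usual sysfs layout
 * created by subsys_system_register() for the "machinecheck" subsystem):
 *
 *   cat /sys/devices/system/machinecheck/machinecheck0/bank0
 *   echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank0
 *
 * A write updates the bank's ->ctl value and triggers mce_restart(),
 * which rewrites the bank control MSRs on all CPUs.
 */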

static ssize_t set_ignore_ce(struct device *s,
                             struct device_attribute *attr,
                             const char *buf, size_t size)
{
        u64 new;

        if (kstrtou64(buf, 0, &new) < 0)
                return -EINVAL;

        if (mca_cfg.ignore_ce ^ !!new) {
                if (new) {
                        /* disable ce features */
                        mce_timer_delete_all();
                        on_each_cpu(mce_disable_cmci, NULL, 1);
                        mca_cfg.ignore_ce = true;
                } else {
                        /* enable ce features */
                        mca_cfg.ignore_ce = false;
                        on_each_cpu(mce_enable_ce, (void *)1, 1);
                }
        }
        return size;
}

static ssize_t set_cmci_disabled(struct device *s,
                                 struct device_attribute *attr,
                                 const char *buf, size_t size)
{
        u64 new;

        if (kstrtou64(buf, 0, &new) < 0)
                return -EINVAL;

        if (mca_cfg.cmci_disabled ^ !!new) {
                if (new) {
                        /* disable cmci */
                        on_each_cpu(mce_disable_cmci, NULL, 1);
                        mca_cfg.cmci_disabled = true;
                } else {
                        /* enable cmci */
                        mca_cfg.cmci_disabled = false;
                        on_each_cpu(mce_enable_ce, NULL, 1);
                }
        }
        return size;
}
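
/*
 * Editorial note, illustrative usage only (same assumed sysfs layout as
 * above):
 *
 *   echo 1 > /sys/devices/system/machinecheck/machinecheck0/ignore_ce
 *   echo 1 > /sys/devices/system/machinecheck/machinecheck0/cmci_disabled
 *
 * ignore_ce=1 stops both the polling timer and CMCI; cmci_disabled=1
 * turns off only CMCI and leaves the polling timer running. Re-enabling
 * ignore_ce passes a non-NULL argument to mce_enable_ce() so the per-CPU
 * timer is restarted as well.
 */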

static ssize_t store_int_with_restart(struct device *s,
                                      struct device_attribute *attr,
                                      const char *buf, size_t size)
{
        ssize_t ret = device_store_int(s, attr, buf, size);

        mce_restart();
        return ret;
}

static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
        &check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
        __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
        &mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
        __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
        &mca_cfg.cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
        &dev_attr_tolerant.attr,
        &dev_attr_check_interval.attr,
#ifdef CONFIG_X86_MCELOG_LEGACY
        &dev_attr_trigger,
#endif
        &dev_attr_monarch_timeout.attr,
        &dev_attr_dont_log_ce.attr,
        &dev_attr_ignore_ce.attr,
        &dev_attr_cmci_disabled.attr,
        NULL
};

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
        kfree(dev);
}

/* Per cpu device init. All of the cpus still share the same ctrl bank: */
static int mce_device_create(unsigned int cpu)
{
        struct device *dev;
        int err;
        int i, j;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        dev = per_cpu(mce_device, cpu);
        if (dev)
                return 0;

        dev = kzalloc(sizeof *dev, GFP_KERNEL);
        if (!dev)
                return -ENOMEM;
        dev->id = cpu;
        dev->bus = &mce_subsys;
        dev->release = &mce_device_release;

        err = device_register(dev);
        if (err) {
                put_device(dev);
                return err;
        }

        for (i = 0; mce_device_attrs[i]; i++) {
                err = device_create_file(dev, mce_device_attrs[i]);
                if (err)
                        goto error;
        }
        for (j = 0; j < mca_cfg.banks; j++) {
                err = device_create_file(dev, &mce_banks[j].attr);
                if (err)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_device_initialized);
        per_cpu(mce_device, cpu) = dev;

        return 0;
error2:
        while (--j >= 0)
                device_remove_file(dev, &mce_banks[j].attr);
error:
        while (--i >= 0)
                device_remove_file(dev, mce_device_attrs[i]);

        device_unregister(dev);

        return err;
}

static void mce_device_remove(unsigned int cpu)
{
        struct device *dev = per_cpu(mce_device, cpu);
        int i;

        if (!cpumask_test_cpu(cpu, mce_device_initialized))
                return;

        for (i = 0; mce_device_attrs[i]; i++)
                device_remove_file(dev, mce_device_attrs[i]);

        for (i = 0; i < mca_cfg.banks; i++)
                device_remove_file(dev, &mce_banks[i].attr);

        device_unregister(dev);
        cpumask_clear_cpu(cpu, mce_device_initialized);
        per_cpu(mce_device, cpu) = NULL;
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void)
{
        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;

        if (!cpuhp_tasks_frozen)
                cmci_clear();

        vendor_disable_error_reporting();
}

static void mce_reenable_cpu(void)
{
        int i;

        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;

        if (!cpuhp_tasks_frozen)
                cmci_reenable();
        for (i = 0; i < mca_cfg.banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrl(msr_ops.ctl(i), b->ctl);
        }
}

static int mce_cpu_dead(unsigned int cpu)
{
        mce_intel_hcpu_update(cpu);

        /* intentionally ignoring frozen here */
        if (!cpuhp_tasks_frozen)
                cmci_rediscover();
        return 0;
}

static int mce_cpu_online(unsigned int cpu)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);
        int ret;

        mce_device_create(cpu);

        ret = mce_threshold_create_device(cpu);
        if (ret) {
                mce_device_remove(cpu);
                return ret;
        }
        mce_reenable_cpu();
        mce_start_timer(t);
        return 0;
}

static int mce_cpu_pre_down(unsigned int cpu)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);

        mce_disable_cpu();
        del_timer_sync(t);
        mce_threshold_remove_device(cpu);
        mce_device_remove(cpu);
        return 0;
}

static __init void mce_init_banks(void)
{
        int i;

        for (i = 0; i < mca_cfg.banks; i++) {
                struct mce_bank *b = &mce_banks[i];
                struct device_attribute *a = &b->attr;

                sysfs_attr_init(&a->attr);
                a->attr.name = b->attrname;
                snprintf(b->attrname, ATTR_LEN, "bank%d", i);

                a->attr.mode = 0644;
                a->show      = show_bank;
                a->store     = set_bank;
        }
}

static __init int mcheck_init_device(void)
{
        int err;

        if (!mce_available(&boot_cpu_data)) {
                err = -EIO;
                goto err_out;
        }

        if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
                err = -ENOMEM;
                goto err_out;
        }

        mce_init_banks();

        err = subsys_system_register(&mce_subsys, NULL);
        if (err)
                goto err_out_mem;

        err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
                                mce_cpu_dead);
        if (err)
                goto err_out_mem;

        err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
                                mce_cpu_online, mce_cpu_pre_down);
        if (err < 0)
                goto err_out_online;

        register_syscore_ops(&mce_syscore_ops);

        return 0;

err_out_online:
        cpuhp_remove_state(CPUHP_X86_MCE_DEAD);

err_out_mem:
        free_cpumask_var(mce_device_initialized);

err_out:
        pr_err("Unable to init MCE device (rc: %d)\n", err);

        return err;
}

device_initcall_sync(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
        mca_cfg.disabled = true;
        return 1;
}
__setup("nomce", mcheck_disable);
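
/*
 * Editorial note: booting with "nomce" has the same effect as "mce=off";
 * both simply set mca_cfg.disabled.
 */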

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
        static struct dentry *dmce;

        if (!dmce)
                dmce = debugfs_create_dir("mce", NULL);

        return dmce;
}

static void mce_reset(void)
{
        cpu_missing = 0;
        atomic_set(&mce_fake_panicked, 0);
        atomic_set(&mce_executing, 0);
        atomic_set(&mce_callin, 0);
        atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
        *val = fake_panic;
        return 0;
}

static int fake_panic_set(void *data, u64 val)
{
        mce_reset();
        fake_panic = val;
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
                        fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
        struct dentry *dmce, *ffake_panic;

        dmce = mce_get_debugfs_dir();
        if (!dmce)
                return -ENOMEM;
        ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
                                          &fake_panic_fops);
        if (!ffake_panic)
                return -ENOMEM;

        return 0;
}
#else
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif
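
/*
 * Editorial note, illustrative usage only (assuming debugfs is mounted at
 * /sys/kernel/debug): the current fake_panic value can be read with
 *
 *   cat /sys/kernel/debug/mce/fake_panic
 *
 * The file is created read-only (0444) above; when fake_panic_set() is
 * invoked through the attribute's fops it first resets the MCE rendezvous
 * counters via mce_reset() and then stores the new value in fake_panic.
 */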

DEFINE_STATIC_KEY_FALSE(mcsafe_key);
EXPORT_SYMBOL_GPL(mcsafe_key);

static int __init mcheck_late_init(void)
{
        if (mca_cfg.recovery)
                static_branch_inc(&mcsafe_key);

        mcheck_debugfs_init();
        cec_init();

        /*
         * Flush out everything that has been logged during early boot, now that
         * everything has been initialized (workqueues, decoders, ...).
         */
        mce_schedule_work();

        return 0;
}
late_initcall(mcheck_late_init);