mce.c

/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>

#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

static DEFINE_MUTEX(mce_chrdev_read_mutex);

#define mce_log_get_idx_check(p) \
({ \
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
			 !lockdep_is_held(&mce_chrdev_read_mutex), \
			 "suspicious mce_log_get_idx_check() usage"); \
	smp_load_acquire(&(p)); \
})

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT 100	/* 100ns */

DEFINE_PER_CPU(unsigned, mce_exception_count);

struct mce_bank *mce_banks __read_mostly;
struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly = {
	.bootlog = -1,
	/*
	 * Tolerant levels:
	 * 0: always panic on uncorrected errors, log corrected errors
	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
	 * 3: never panic or SIGBUS, log all errors (for testing only)
	 */
	.tolerant = 1,
	.monarch_timeout = -1
};

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;

/*
 * MCA banks polled by the period polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;

static struct work_struct mce_work;
static struct irq_work mce_irq_work;

static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	m->tsc = rdtsc();
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */
static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	/* Emit the trace record: */
	trace_mce_record(mce);

	if (!mce_gen_pool_add(mce))
		irq_work_queue(&mce_irq_work);

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = mce_log_get_idx_check(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}
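
/*
 * Log an injected error record. Take the /dev/mcelog read mutex so the
 * injected entry is serialized against readers of the log buffer.
 */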
void mce_inject_log(struct mce *m)
{
	mutex_lock(&mce_chrdev_read_mutex);
	mce_log(m);
	mutex_unlock(&mce_chrdev_read_mutex);
}
EXPORT_SYMBOL_GPL(mce_inject_log);

static struct notifier_block mce_srao_nb;

void mce_register_decode_chain(struct notifier_block *nb)
{
	/* Ensure SRAO notifier has the highest priority in the decode chain. */
	if (nb != &mce_srao_nb && nb->priority == INT_MAX)
		nb->priority -= 1;

	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
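
/* Print one MCE record to the console and offer it to the decoder chain. */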
static void print_mce(struct mce *m)
{
	int ret = 0;

	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
		 m->extcpu, m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
			 m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	pr_cont("\n");
	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		 cpu_data(m->extcpu).microcode);

	/*
	 * Print out human-readable details about the MCE error,
	 * (if the CPU has an implementation for that)
	 */
	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
	if (ret == NOTIFY_STOP)
		return;

	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_panicked;

static int fake_panic;
static atomic_t mce_fake_panicked;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mca_cfg.panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(const char *msg, struct mce *final, char *exp)
{
	int i, apei_err = 0;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_panicked) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_panicked) > 1)
			return;
	}
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce))) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mca_cfg.panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == mca_cfg.rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	mce_setup(m);

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;

			/*
			 * When in VM86 mode make the cs look like ring 3
			 * always. This is a lie, but it's better than passing
			 * the additional vm86 bit around everywhere.
			 */
			if (v8086_mode(regs))
				m->cs |= 3;
		}
		/* Use accurate RIP reporting if available. */
		if (mca_cfg.rip_msr)
			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
	}
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_gen_pool_empty() && keventd_up())
		schedule_work(&mce_work);
}

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_notify_irq();
	mce_schedule_work();
}
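
/*
 * Report a new event: notify directly when interrupts were enabled (or we
 * were in VM86 mode) at the time of the exception, otherwise defer the
 * notification through irq_work.
 */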
static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

	irq_work_queue(&mce_irq_work);
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;

	/* Checks after this one are Intel-specific: */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return 1;

	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;
	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;

	return 1;
}
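
/*
 * Default notifier on the decode chain: hand the page reported by an
 * action-optional (SRAO) memory error to memory_failure().
 */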
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
				void *data)
{
	struct mce *mce = (struct mce *)data;
	unsigned long pfn;

	if (!mce)
		return NOTIFY_DONE;

	if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
		pfn = mce->addr >> PAGE_SHIFT;
		memory_failure(pfn, MCE_VECTOR, 0);
	}

	return NOTIFY_OK;
}
static struct notifier_block mce_srao_nb = {
	.notifier_call	= srao_decode_notifier,
	.priority = INT_MAX,
};

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}
	}
}
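
/* Decide from the MCA error code whether this event reports a memory error. */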
static bool memory_error(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor == X86_VENDOR_AMD) {
		/* ErrCodeExt[20:16] */
		u8 xec = (m->status >> 16) & 0x1f;

		return (xec == 0x0 || xec == 0x8);
	} else if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		 * indicating a memory error. Bit 8 is used for indicating a
		 * cache hierarchy error. The combination of bit 2 and bit 3
		 * is used for indicating a `generic' cache hierarchy error.
		 * But we can't just blindly check the above bits, because if
		 * bit 11 is set, then it is a bus/interconnect error - and
		 * either way the above bits just gives more detail on what
		 * bus/interconnect error happened. Note that bit 12 can be
		 * ignored, as it's the "filter" bit.
		 */
		return (m->status & 0xef80) == BIT(7) ||
		       (m->status & 0xef00) == BIT(8) ||
		       (m->status & 0xeffc) == 0xc;
	}

	return false;
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	bool error_seen = false;
	struct mce m;
	int severity;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	for (i = 0; i < mca_cfg.banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		error_seen = true;

		mce_read_aux(&m, i);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;

		severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);

		if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
			if (m.status & MCI_STATUS_ADDRV)
				m.severity = severity;

		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
			mce_log(&m);
		else if (mce_usable_address(&m)) {
			/*
			 * Although we skipped logging this, we still want
			 * to take action. Add to the pool so the registered
			 * notifiers will see it.
			 */
			if (!mce_gen_pool_add(&m))
				mce_schedule_work();
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();

	return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
			  struct pt_regs *regs)
{
	int i, ret = 0;
	char *tmp;

	for (i = 0; i < mca_cfg.banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (m->status & MCI_STATUS_VAL) {
			__set_bit(i, validp);
			if (quirk_no_way_out)
				quirk_no_way_out(i, m, regs);
		}

		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
			*msg = tmp;
			ret = 1;
		}
	}
	return ret;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t, const char *msg)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_panicked))
		wait_for_panic();
	if (!mca_cfg.monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		if (mca_cfg.tolerant <= 1)
			mce_panic(msg, NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU which entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu),
					    mca_cfg.tolerant,
					    &nmsg, true);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Fatal machine check", m, msg);

	/*
	 * For an UC error somewhere we let the CPU which detects it handle
	 * it. We also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Fatal machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout,
				  "Timeout: Not all CPUs entered broadcast exception handler")) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout,
					  "Timeout: Subject CPUs unable to finish machine check processing")) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout,
					  "Timeout: Monarch CPU unable to finish machine check processing"))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout,
					  "Timeout: Monarch CPU did not finish machine check processing"))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}
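
/* Clear the status registers of all banks marked in @toclear. */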
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mca_config *cfg = &mca_cfg;
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;

	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, we cannot return cleanly to the interrupted
	 * context: the current process has to be killed with SIGBUS unless
	 * the error is recovered.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
	char *msg = "Unknown";
	u64 recover_paddr = ~0ull;
	int flags = MF_ACTION_REQUIRED;
	int lmce = 0;

	/* If this CPU is offline, just bail out. */
	if (cpu_is_offline(smp_processor_id())) {
		u64 mcgstatus;

		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
		if (mcgstatus & MCG_STATUS_RIPV) {
			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
			return;
		}
	}

	ist_enter(regs);

	this_cpu_inc(mce_exception_count);

	if (!cfg->banks)
		goto out;

	mce_gather_info(&m, regs);

	final = this_cpu_ptr(&mces_seen);
	*final = m;

	memset(valid_banks, 0, sizeof(valid_banks));
	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);

	barrier();

	/*
	 * If there is no valid restart IP we might need to kill the process
	 * or panic. Assume the worst for now, but if we find the severity
	 * is MCE_AR_SEVERITY we have other options.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Check if this MCE is signaled to only this logical processor
	 */
	if (m.mcgstatus & MCG_STATUS_LMCES)
		lmce = 1;
	else {
		/*
		 * Go through all the banks in exclusion of the other CPUs.
		 * This way we don't report duplicated events on shared banks
		 * because the first one to see it will clear it.
		 * If this is a Local MCE, then no need to perform rendezvous.
		 */
		order = mce_start(&no_way_out);
	}

	for (i = 0; i < cfg->banks; i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
			continue;
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected or non signaled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

		severity = mce_severity(&m, cfg->tolerant, NULL, true);

		/*
		 * When machine check was for corrected/deferred handler don't
		 * touch, unless we're panicking.
		 */
		if ((severity == MCE_KEEP_SEVERITY ||
		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		mce_read_aux(&m, i);

		/* assuming valid severity level != 0 */
		m.severity = severity;

		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	/* mce_clear_state will clear *final, save locally for use later */
	m = *final;

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (!lmce) {
		if (mce_end(order) < 0)
			no_way_out = worst >= MCE_PANIC_SEVERITY;
	} else {
		/*
		 * Local MCE skipped calling mce_reign()
		 * If we found a fatal error, we need to panic here.
		 */
		if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
			mce_panic("Machine check from unknown source",
				  NULL, NULL);
	}

	/*
	 * At insane "tolerant" levels we take no action. Otherwise
	 * we only die if we have no other choice. For less serious
	 * issues we try to recover, or limit damage to the current
	 * process.
	 */
	if (cfg->tolerant < 3) {
		if (no_way_out)
			mce_panic("Fatal machine check on current CPU", &m, msg);
		if (worst == MCE_AR_SEVERITY) {
			recover_paddr = m.addr;
			if (!(m.mcgstatus & MCG_STATUS_RIPV))
				flags |= MF_MUST_KILL;
		} else if (kill_it) {
			force_sig(SIGBUS, current);
		}
	}

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	sync_core();

	if (recover_paddr == ~0ull)
		goto done;

	pr_err("Uncorrected hardware memory error in user-access at %llx",
	       recover_paddr);
	/*
	 * We must call memory_failure() here even if the current process is
	 * doomed. We still need to mark the page as poisoned and alert any
	 * other users of the page.
	 */
	ist_begin_non_atomic(regs);
	local_irq_enable();
	if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
		pr_err("Memory error not recovered");
		force_sig(SIGBUS, current);
	}
	local_irq_disable();
	ist_end_non_atomic();
done:
	ist_exit(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);

#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int vector, int flags)
{
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
	       pfn);

	return 0;
}
#endif

/*
 * Action optional processing happens here (picking up
 * from the list of faulting pages that do_machine_check()
 * placed into the genpool).
 */
static void mce_process_work(struct work_struct *dummy)
{
	mce_gen_pool_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static unsigned long check_interval = INITIAL_CHECK_INTERVAL;

static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static unsigned long mce_adjust_timer_default(unsigned long interval)
{
	return interval;
}

static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
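
/*
 * (Re)arm the per-CPU polling timer so that it fires within roughly
 * @interval jiffies from now.
 */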
static void __restart_timer(struct timer_list *t, unsigned long interval)
{
	unsigned long when = jiffies + interval;
	unsigned long flags;

	local_irq_save(flags);

	if (timer_pending(t)) {
		if (time_before(when, t->expires))
			mod_timer_pinned(t, when);
	} else {
		t->expires = round_jiffies(when);
		add_timer_on(t, smp_processor_id());
	}

	local_irq_restore(flags);
}
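
/* Periodic poll timer callback: poll the banks and adapt the next interval. */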
static void mce_timer_fn(unsigned long data)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);
	int cpu = smp_processor_id();
	unsigned long iv;

	WARN_ON(cpu != data);

	iv = __this_cpu_read(mce_next_interval);

	if (mce_available(this_cpu_ptr(&cpu_info))) {
		machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));

		if (mce_intel_cmci_poll()) {
			iv = mce_adjust_timer(iv);
			goto done;
		}
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the polling
	 * interval, otherwise increase the polling interval.
	 */
	if (mce_notify_irq())
		iv = max(iv / 2, (unsigned long) HZ/100);
	else
		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));

done:
	__this_cpu_write(mce_next_interval, iv);
	__restart_timer(t, iv);
}

/*
 * Ensure that the timer is firing in @interval from now.
 */
void mce_timer_kick(unsigned long interval)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);
	unsigned long iv = __this_cpu_read(mce_next_interval);

	__restart_timer(t, interval);

	if (interval < iv)
		__this_cpu_write(mce_next_interval, interval);
}

/* Must not be called in IRQ context where del_timer_sync() can deadlock */
static void mce_timer_delete_all(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		del_timer_sync(&per_cpu(mce_timer, cpu));
}
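
/* Run the configured user mode helper ("trigger") for new events. */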
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		/* wake processes polling /dev/mcelog */
		wake_up_interruptible(&mce_chrdev_wait);

		if (mce_helper[0])
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			pr_info(HW_ERR "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
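
/* Allocate and initialize the control structures for all MCA banks. */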
static int __mcheck_cpu_mce_banks_init(void)
{
	int i;
	u8 num_banks = mca_cfg.banks;

	mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;

	for (i = 0; i < num_banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __mcheck_cpu_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	if (!mca_cfg.banks)
		pr_info("CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		pr_warn("Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
	mca_cfg.banks = b;

	if (!mce_banks) {
		int err = __mcheck_cpu_mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mca_cfg.ser = true;

	return 0;
}
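
/*
 * Enable machine checks on this CPU: log leftover errors, set CR4.MCE and
 * program the per-bank control and status registers.
 */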
static void __mcheck_cpu_init_generic(void)
{
	enum mcp_flags m_fl = 0;
	mce_banks_t all_banks;
	u64 cap;
	int i;

	if (!mca_cfg.bootlog)
		m_fl = MCP_DONTLOG;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC | m_fl, &all_banks);

	cr4_set_bits(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
 * Vol 3B Table 15-20). But this confuses both the code that determines
 * whether the machine check occurred in kernel or user mode, and also
 * the severity assessment code. Pretend that EIPV was set, and take the
 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 */
static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
	if (bank != 0)
		return;
	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
		return;
	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
			  MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
			  MCACOD)) !=
			 (MCI_STATUS_UC|MCI_STATUS_EN|
			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
			  MCI_STATUS_AR|MCACOD_INSTR))
		return;

	m->mcgstatus |= MCG_STATUS_EIPV;
	m->ip = regs->ip;
	m->cs = regs->cs;
}

/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	struct mca_config *cfg = &mca_cfg;

	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
		pr_info("unknown CPU type - not enabling MCE support\n");
		return -EOPNOTSUPP;
	}

	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && cfg->banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && cfg->bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			cfg->bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && cfg->banks > 0)
			mce_banks[0].ctl = 0;

		/*
		 * overflow_recov is supported for F15h Models 00h-0fh
		 * even though we don't have a CPUID bit for it.
		 */
		if (c->x86 == 0x15 && c->x86_model <= 0xf)
			mce_flags.overflow_recov = 1;

		/*
		 * Turn off MC4_MISC thresholding banks on those models since
		 * they're not supported there.
		 */
		if (c->x86 == 0x15 &&
		    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
			int i;
			u64 hwcr;
			bool need_toggle;
			u32 msrs[] = {
				0x00000413, /* MC4_MISC0 */
				0xc0000408, /* MC4_MISC1 */
			};

			rdmsrl(MSR_K7_HWCR, hwcr);

			/* McStatusWrEn has to be set */
			need_toggle = !(hwcr & BIT(18));

			if (need_toggle)
				wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));

			/* Clear CntP bit safely */
			for (i = 0; i < ARRAY_SIZE(msrs); i++)
				msr_clear_bit(msrs[i], 62);

			/* restore old settings */
			if (need_toggle)
				wrmsrl(MSR_K7_HWCR, hwcr);
		}
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    cfg->monarch_timeout < 0)
			cfg->monarch_timeout = USEC_PER_SEC;

		/*
		 * There are also broken BIOSes on some Pentium M and
		 * earlier systems:
		 */
		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
			cfg->bootlog = 0;

		if (c->x86 == 6 && c->x86_model == 45)
			quirk_no_way_out = quirk_sandybridge_ifu;
	}
	if (cfg->monarch_timeout < 0)
		cfg->monarch_timeout = 0;
	if (cfg->bootlog != 0)
		cfg->panic_timeout = 30;

	return 0;
}
  1354. static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
  1355. {
  1356. if (c->x86 != 5)
  1357. return 0;
  1358. switch (c->x86_vendor) {
  1359. case X86_VENDOR_INTEL:
  1360. intel_p5_mcheck_init(c);
  1361. return 1;
  1362. break;
  1363. case X86_VENDOR_CENTAUR:
  1364. winchip_mcheck_init(c);
  1365. return 1;
  1366. break;
  1367. default:
  1368. return 0;
  1369. }
  1370. return 0;
  1371. }
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		mce_adjust_timer = cmci_intel_adjust_timer;
		break;

	case X86_VENDOR_AMD: {
		u32 ebx = cpuid_ebx(0x80000007);

		mce_amd_feature_init(c);
		mce_flags.overflow_recov = !!(ebx & BIT(0));
		mce_flags.succor = !!(ebx & BIT(1));
		mce_flags.smca = !!(ebx & BIT(3));
		break;
	}

	default:
		break;
	}
}

static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_clear(c);
		break;

	default:
		break;
	}
}
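
/*
 * Periodic polling: mce_start_timer() arms the per-CPU timer to fire after
 * check_interval seconds, unless corrected-error handling is ignored
 * (mca_cfg.ignore_ce) or the interval is zero.
 */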
static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
	unsigned long iv = check_interval * HZ;

	if (mca_cfg.ignore_ce || !iv)
		return;

	per_cpu(mce_next_interval, cpu) = iv;

	t->expires = round_jiffies(jiffies + iv);
	add_timer_on(t, cpu);
}

static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);
	unsigned int cpu = smp_processor_id();

	setup_timer(t, mce_timer_fn, cpu);
	mce_start_timer(cpu, t);
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return;

	if (__mcheck_cpu_ancient_init(c))
		return;

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mca_cfg.disabled = true;
		return;
	}

	if (mce_gen_pool_init()) {
		mca_cfg.disabled = true;
		pr_emerg("Couldn't allocate MCE records pool!\n");
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
}

/*
 * Called for each booted CPU to clear some machine checks opt-ins
 */
void mcheck_cpu_clear(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return;

	if (!mce_available(c))
		return;

	/*
	 * Possibly to clear general settings generic to x86
	 * __mcheck_cpu_clear_generic(c);
	 */
	__mcheck_cpu_clear_vendor(c);
}

/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;	/* #times opened */
static int mce_chrdev_open_exclu;	/* already open exclusive? */

static int mce_chrdev_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	if (mce_chrdev_open_exclu ||
	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_chrdev_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		mce_chrdev_open_exclu = 1;
	mce_chrdev_open_count++;

	spin_unlock(&mce_chrdev_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_chrdev_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	mce_chrdev_open_count--;
	mce_chrdev_open_exclu = 0;

	spin_unlock(&mce_chrdev_state_lock);

	return 0;
}
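
/*
 * Run on every CPU via on_each_cpu(): record the CPU's current TSC so
 * mce_chrdev_read() can tell finished log entries from ones that were
 * still being written while the log was drained.
 */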
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	cpu_tsc[smp_processor_id()] = rdtsc();
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		/*
		 * When ERST is disabled, mce_chrdev_read() should return
		 * "no record" instead of "no device."
		 */
		if (rc == -ENODEV)
			return 0;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * Ideally the record would only be cleared once /sbin/mcelog has
	 * flushed it to disk or sent it over the network, but there is no
	 * interface for that yet, so clear it here to avoid duplicates.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	*ubuf += sizeof(struct mce);

	return 0;
}

static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
				size_t usize, loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_chrdev_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = mce_log_get_idx_check(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			struct mce *m = &mcelog.entry[i];

			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(m, 0, sizeof(*m));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, m, sizeof(*m));
			buf += sizeof(*m);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
			err |= copy_to_user(buf, m, sizeof(*m));
			smp_rmb();
			buf += sizeof(*m);
			memset(m, 0, sizeof(*m));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_chrdev_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}
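
/*
 * Readers of /dev/mcelog sleep in poll() until either the in-kernel log has
 * entries or the APEI/ERST store still holds records from a previous boot.
 */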
static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_chrdev_wait, wait);
	if (READ_ONCE(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
				unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
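
/*
 * Writes to /dev/mcelog only do something once a handler has been installed
 * via register_mce_write_callback() (error-injection code does this);
 * otherwise mce_chrdev_write() returns -EINVAL.
 */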
static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
			    size_t usize, loff_t *off);

void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
				const char __user *ubuf,
				size_t usize, loff_t *off))
{
	mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);

static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
				size_t usize, loff_t *off)
{
	if (mce_write)
		return mce_write(filp, ubuf, usize, off);
	else
		return -EINVAL;
}

static const struct file_operations mce_chrdev_ops = {
	.open			= mce_chrdev_open,
	.release		= mce_chrdev_release,
	.read			= mce_chrdev_read,
	.write			= mce_chrdev_write,
	.poll			= mce_chrdev_poll,
	.unlocked_ioctl		= mce_chrdev_ioctl,
	.llseek			= no_llseek,
};

static struct miscdevice mce_chrdev_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

static void __mce_disable_bank(void *arg)
{
	int bank = *((int *)arg);

	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
	cmci_disable_bank(bank);
}

void mce_disable_bank(int bank)
{
	if (bank >= mca_cfg.banks) {
		pr_warn(FW_BUG
			"Ignoring request to disable invalid MCA bank %d.\n",
			bank);
		return;
	}
	set_bit(bank, mce_banks_ce_disabled);
	on_each_cpu(__mce_disable_bank, &bank, 1);
}

/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=no_lmce Disables LMCE
 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 * mce=bios_cmci_threshold Don't program the CMCI threshold
 */
static int __init mcheck_enable(char *str)
{
	struct mca_config *cfg = &mca_cfg;

	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		cfg->disabled = true;
	else if (!strcmp(str, "no_cmci"))
		cfg->cmci_disabled = true;
	else if (!strcmp(str, "no_lmce"))
		cfg->lmce_disabled = true;
	else if (!strcmp(str, "dont_log_ce"))
		cfg->dont_log_ce = true;
	else if (!strcmp(str, "ignore_ce"))
		cfg->ignore_ce = true;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		cfg->bootlog = (str[0] == 'b');
	else if (!strcmp(str, "bios_cmci_threshold"))
		cfg->bios_cmci_threshold = true;
	else if (isdigit(str[0])) {
		if (get_option(&str, &cfg->tolerant) == 2)
			get_option(&str, &(cfg->monarch_timeout));
	} else {
		pr_info("mce argument %s ignored. Please use /sys\n", str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);
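
/*
 * One-time early init: Intel thermal reporting, the SRAO notifier on the
 * decode chain, the vendor severity table, and the work/irq_work items used
 * to process logged errors outside the machine-check context.
 */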
int __init mcheck_init(void)
{
	mcheck_intel_therm_init();
	mce_register_decode_chain(&mce_srao_nb);
	mcheck_vendor_init_severity();

	INIT_WORK(&mce_work, mce_process_work);
	init_irq_work(&mce_irq_work, mce_irq_work_cb);

	return 0;
}

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static void mce_disable_error_reporting(void)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void vendor_disable_error_reporting(void)
{
	/*
	 * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
	 * Disabling them for just a single offlined CPU is bad, since it will
	 * inhibit reporting for all shared resources on the socket like the
	 * last level cache (LLC), the integrated memory controller (iMC), etc.
	 */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
		return;

	mce_disable_error_reporting();
}

static int mce_syscore_suspend(void)
{
	vendor_disable_error_reporting();
	return 0;
}

static void mce_syscore_shutdown(void)
{
	vendor_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
}

static struct syscore_ops mce_syscore_ops = {
	.suspend	= mce_syscore_suspend,
	.shutdown	= mce_syscore_shutdown,
	.resume		= mce_syscore_resume,
};

/*
 * mce_device: Sysfs support
 */
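
/*
 * Per-CPU half of mce_restart(): redo generic bank and timer setup on a CPU
 * after a sysfs knob changed, provided MCE is actually available there.
 */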
static void mce_cpu_restart(void *data)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	mce_timer_delete_all();
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	cmci_reenable();
	cmci_recheck();
	/* also restart the polling timer when called for all CPUs (all != NULL) */
	if (all)
		__mcheck_cpu_init_timer();
}

static struct bus_type mce_subsys = {
	.name		= "machinecheck",
	.dev_name	= "machinecheck",
};

DEFINE_PER_CPU(struct device *, mce_device);

void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}
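
/*
 * The ignore_ce and cmci_disabled sysfs knobs fan out to every CPU: turning
 * them on stops the polling timer and/or CMCI, turning them off re-arms them.
 */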
static ssize_t set_ignore_ce(struct device *s,
			     struct device_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	if (mca_cfg.ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			mce_timer_delete_all();
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.ignore_ce = true;
		} else {
			/* enable ce features */
			mca_cfg.ignore_ce = false;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct device *s,
				 struct device_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	if (mca_cfg.cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.cmci_disabled = true;
		} else {
			/* enable cmci */
			mca_cfg.cmci_disabled = false;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = device_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
	&mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
	&mca_cfg.cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
	&dev_attr_trigger,
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};
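
/*
 * These attributes, plus one file per bank, show up under
 * /sys/devices/system/machinecheck/machinecheck<cpu>/ once
 * mce_device_create() registers the per-CPU device on mce_subsys.
 */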
static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
	kfree(dev);
}

/* Per cpu device init. All of the cpus still share the same ctrl bank: */
static int mce_device_create(unsigned int cpu)
{
	struct device *dev;
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->id = cpu;
	dev->bus = &mce_subsys;
	dev->release = &mce_device_release;

	err = device_register(dev);
	if (err) {
		put_device(dev);
		return err;
	}

	for (i = 0; mce_device_attrs[i]; i++) {
		err = device_create_file(dev, mce_device_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < mca_cfg.banks; j++) {
		err = device_create_file(dev, &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = dev;

	return 0;
error2:
	while (--j >= 0)
		device_remove_file(dev, &mce_banks[j].attr);
error:
	while (--i >= 0)
		device_remove_file(dev, mce_device_attrs[i]);

	device_unregister(dev);

	return err;
}

static void mce_device_remove(unsigned int cpu)
{
	struct device *dev = per_cpu(mce_device, cpu);
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_device_attrs[i]; i++)
		device_remove_file(dev, mce_device_attrs[i]);

	for (i = 0; i < mca_cfg.banks; i++)
		device_remove_file(dev, &mce_banks[i].attr);

	device_unregister(dev);
	cpumask_clear_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = NULL;
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;

	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();

	vendor_disable_error_reporting();
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		mce_device_create(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_device_remove(cpu);
		mce_intel_hcpu_update(cpu);

		/* intentionally ignoring frozen here */
		if (!(action & CPU_TASKS_FROZEN))
			cmci_rediscover();
		break;
	case CPU_DOWN_PREPARE:
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		del_timer_sync(t);
		break;
	case CPU_DOWN_FAILED:
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		mce_start_timer(cpu, t);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};
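
/*
 * Give each bank a writable sysfs attribute ("bank0" ... "bankN") backed by
 * show_bank()/set_bank() so the per-bank CTL value can be tuned at runtime.
 */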
static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct device_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data)) {
		err = -EIO;
		goto err_out;
	}

	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
		err = -ENOMEM;
		goto err_out;
	}

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		goto err_out_mem;

	cpu_notifier_register_begin();
	for_each_online_cpu(i) {
		err = mce_device_create(i);
		if (err) {
			/*
			 * Register notifier anyway (and do not unreg it) so
			 * that we don't leave undeleted timers, see notifier
			 * callback above.
			 */
			__register_hotcpu_notifier(&mce_cpu_notifier);
			cpu_notifier_register_done();
			goto err_device_create;
		}
	}

	__register_hotcpu_notifier(&mce_cpu_notifier);
	cpu_notifier_register_done();

	register_syscore_ops(&mce_syscore_ops);

	/* register character device /dev/mcelog */
	err = misc_register(&mce_chrdev_device);
	if (err)
		goto err_register;

	return 0;

err_register:
	unregister_syscore_ops(&mce_syscore_ops);

err_device_create:
	/*
	 * We didn't keep track of which devices were created above, but
	 * even if we had, the set of online cpus might have changed.
	 * Play safe and remove for every possible cpu, since
	 * mce_device_remove() will do the right thing.
	 */
	for_each_possible_cpu(i)
		mce_device_remove(i);

err_out_mem:
	free_cpumask_var(mce_device_initialized);

err_out:
	pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);

	return err;
}
device_initcall_sync(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mca_cfg.disabled = true;
	return 1;
}
__setup("nomce", mcheck_disable);
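
/*
 * Debugfs support: <debugfs>/mce/fake_panic is consulted by the panic path
 * so the MCE rendezvous/panic machinery can be exercised without actually
 * taking the machine down; mce_reset() clears the rendezvous counters.
 */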
#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_panicked, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
#else
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif

static int __init mcheck_late_init(void)
{
	mcheck_debugfs_init();

	/*
	 * Flush out everything that has been logged during early boot, now that
	 * everything has been initialized (workqueues, decoders, ...).
	 */
	mce_schedule_work();

	return 0;
}
late_initcall(mcheck_late_init);