mce.c

  1. /*
  2. * Machine check handler.
  3. *
  4. * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  5. * Rest from unknown author(s).
  6. * 2004 Andi Kleen. Rewrote most of it.
  7. * Copyright 2008 Intel Corporation
  8. * Author: Andi Kleen
  9. */
  10. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  11. #include <linux/thread_info.h>
  12. #include <linux/capability.h>
  13. #include <linux/miscdevice.h>
  14. #include <linux/ratelimit.h>
  15. #include <linux/kallsyms.h>
  16. #include <linux/rcupdate.h>
  17. #include <linux/kobject.h>
  18. #include <linux/uaccess.h>
  19. #include <linux/kdebug.h>
  20. #include <linux/kernel.h>
  21. #include <linux/percpu.h>
  22. #include <linux/string.h>
  23. #include <linux/device.h>
  24. #include <linux/syscore_ops.h>
  25. #include <linux/delay.h>
  26. #include <linux/ctype.h>
  27. #include <linux/sched.h>
  28. #include <linux/sysfs.h>
  29. #include <linux/types.h>
  30. #include <linux/slab.h>
  31. #include <linux/init.h>
  32. #include <linux/kmod.h>
  33. #include <linux/poll.h>
  34. #include <linux/nmi.h>
  35. #include <linux/cpu.h>
  36. #include <linux/smp.h>
  37. #include <linux/fs.h>
  38. #include <linux/mm.h>
  39. #include <linux/debugfs.h>
  40. #include <linux/irq_work.h>
  41. #include <linux/export.h>
  42. #include <asm/processor.h>
  43. #include <asm/traps.h>
  44. #include <asm/tlbflush.h>
  45. #include <asm/mce.h>
  46. #include <asm/msr.h>
  47. #include "mce-internal.h"
  48. static DEFINE_MUTEX(mce_chrdev_read_mutex);
  49. #define rcu_dereference_check_mce(p) \
  50. ({ \
  51. rcu_lockdep_assert(rcu_read_lock_sched_held() || \
  52. lockdep_is_held(&mce_chrdev_read_mutex), \
  53. "suspicious rcu_dereference_check_mce() usage"); \
  54. smp_load_acquire(&(p)); \
  55. })
  56. #define CREATE_TRACE_POINTS
  57. #include <trace/events/mce.h>
  58. #define SPINUNIT 100 /* 100ns */
  59. DEFINE_PER_CPU(unsigned, mce_exception_count);
  60. struct mce_bank *mce_banks __read_mostly;
  61. struct mce_vendor_flags mce_flags __read_mostly;
  62. struct mca_config mca_cfg __read_mostly = {
  63. .bootlog = -1,
  64. /*
  65. * Tolerant levels:
  66. * 0: always panic on uncorrected errors, log corrected errors
  67. * 1: panic or SIGBUS on uncorrected errors, log corrected errors
  68. * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
  69. * 3: never panic or SIGBUS, log all errors (for testing only)
  70. */
  71. .tolerant = 1,
  72. .monarch_timeout = -1
  73. };
  74. /* User mode helper program triggered by machine check event */
  75. static unsigned long mce_need_notify;
  76. static char mce_helper[128];
  77. static char *mce_helper_argv[2] = { mce_helper, NULL };
  78. static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
  79. static DEFINE_PER_CPU(struct mce, mces_seen);
  80. static int cpu_missing;
  81. /*
  82. * MCA banks polled by the periodic polling timer for corrected events.
  83. * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
  84. */
  85. DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
  86. [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
  87. };
  88. /*
  89. * MCA banks controlled through firmware first for corrected errors.
  90. * This is a global list of banks for which we won't enable CMCI and we
  91. * won't poll. Firmware controls these banks and is responsible for
  92. * reporting corrected errors through GHES. Uncorrected/recoverable
  93. * errors are still notified through a machine check.
  94. */
  95. mce_banks_t mce_banks_ce_disabled;
  96. static DEFINE_PER_CPU(struct work_struct, mce_work);
  97. static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
  98. /*
  99. * CPU/chipset specific EDAC code can register a notifier call here to print
  100. * MCE errors in a human-readable form.
  101. */
  102. static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
  103. /* Do initial initialization of a struct mce */
  104. void mce_setup(struct mce *m)
  105. {
  106. memset(m, 0, sizeof(struct mce));
  107. m->cpu = m->extcpu = smp_processor_id();
  108. rdtscll(m->tsc);
  109. /* We hope get_seconds stays lockless */
  110. m->time = get_seconds();
  111. m->cpuvendor = boot_cpu_data.x86_vendor;
  112. m->cpuid = cpuid_eax(1);
  113. m->socketid = cpu_data(m->extcpu).phys_proc_id;
  114. m->apicid = cpu_data(m->extcpu).initial_apicid;
  115. rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
  116. }
  117. DEFINE_PER_CPU(struct mce, injectm);
  118. EXPORT_PER_CPU_SYMBOL_GPL(injectm);
  119. /*
  120. * Lockless MCE logging infrastructure.
  121. * This avoids deadlocks on printk locks without having to break locks. Also
  122. * separate MCEs from kernel messages to avoid bogus bug reports.
  123. */
  124. static struct mce_log mcelog = {
  125. .signature = MCE_LOG_SIGNATURE,
  126. .len = MCE_LOG_LEN,
  127. .recordlen = sizeof(struct mce),
  128. };
  129. void mce_log(struct mce *mce)
  130. {
  131. unsigned next, entry;
  132. /* Emit the trace record: */
  133. trace_mce_record(mce);
  134. atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
  135. mce->finished = 0;
  136. wmb();
  137. for (;;) {
  138. entry = rcu_dereference_check_mce(mcelog.next);
  139. for (;;) {
  140. /*
  141. * When the buffer fills up discard new entries.
  142. * Assume that the earlier errors are the more
  143. * interesting ones:
  144. */
  145. if (entry >= MCE_LOG_LEN) {
  146. set_bit(MCE_OVERFLOW,
  147. (unsigned long *)&mcelog.flags);
  148. return;
  149. }
  150. /* Old left over entry. Skip: */
  151. if (mcelog.entry[entry].finished) {
  152. entry++;
  153. continue;
  154. }
  155. break;
  156. }
  157. smp_rmb();
  158. next = entry + 1;
  159. if (cmpxchg(&mcelog.next, entry, next) == entry)
  160. break;
  161. }
  162. memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
  163. wmb();
  164. mcelog.entry[entry].finished = 1;
  165. wmb();
  166. mce->finished = 1;
  167. set_bit(0, &mce_need_notify);
  168. }
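/*
 * Illustrative sketch (not from the original file): the claim protocol used
 * by mce_log() above generalizes to any multi-producer, append-only log. A
 * producer reserves a slot with cmpxchg() on the tail index, fills it, and
 * only then publishes it via the per-entry flag:
 *
 *	for (;;) {
 *		entry = READ_ONCE(log.next);
 *		if (entry >= LOG_LEN)
 *			return;			// buffer full: drop the record
 *		if (cmpxchg(&log.next, entry, entry + 1) == entry)
 *			break;			// slot claimed
 *	}
 *	log.entry[entry] = *rec;		// fill the claimed slot
 *	wmb();					// order fill before publish
 *	log.entry[entry].finished = 1;		// publish
 *
 * A consumer must check 'finished' before trusting a slot, exactly as
 * drain_mcelog_buffer() below does.
 */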
  169. static void drain_mcelog_buffer(void)
  170. {
  171. unsigned int next, i, prev = 0;
  172. next = ACCESS_ONCE(mcelog.next);
  173. do {
  174. struct mce *m;
  175. /* drain what was logged during boot */
  176. for (i = prev; i < next; i++) {
  177. unsigned long start = jiffies;
  178. unsigned retries = 1;
  179. m = &mcelog.entry[i];
  180. while (!m->finished) {
  181. if (time_after_eq(jiffies, start + 2*retries))
  182. retries++;
  183. cpu_relax();
  184. if (!m->finished && retries >= 4) {
  185. pr_err("skipping error being logged currently!\n");
  186. break;
  187. }
  188. }
  189. smp_rmb();
  190. atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
  191. }
  192. memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
  193. prev = next;
  194. next = cmpxchg(&mcelog.next, prev, 0);
  195. } while (next != prev);
  196. }
  197. void mce_register_decode_chain(struct notifier_block *nb)
  198. {
  199. atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
  200. drain_mcelog_buffer();
  201. }
  202. EXPORT_SYMBOL_GPL(mce_register_decode_chain);
  203. void mce_unregister_decode_chain(struct notifier_block *nb)
  204. {
  205. atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
  206. }
  207. EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
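/*
 * Illustrative sketch (not from the original file): an EDAC or vendor
 * decoder typically hooks into the chain like this; my_decode() and my_nb
 * are hypothetical names:
 *
 *	static int my_decode(struct notifier_block *nb, unsigned long val,
 *			     void *data)
 *	{
 *		struct mce *m = data;
 *
 *		pr_info("MCE on CPU %d, bank %d, status %llx\n",
 *			m->extcpu, m->bank, m->status);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_nb = { .notifier_call = my_decode };
 *
 *	mce_register_decode_chain(&my_nb);	// module init
 *	mce_unregister_decode_chain(&my_nb);	// module exit
 */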
  208. static void print_mce(struct mce *m)
  209. {
  210. int ret = 0;
  211. pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
  212. m->extcpu, m->mcgstatus, m->bank, m->status);
  213. if (m->ip) {
  214. pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
  215. !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
  216. m->cs, m->ip);
  217. if (m->cs == __KERNEL_CS)
  218. print_symbol("{%s}", m->ip);
  219. pr_cont("\n");
  220. }
  221. pr_emerg(HW_ERR "TSC %llx ", m->tsc);
  222. if (m->addr)
  223. pr_cont("ADDR %llx ", m->addr);
  224. if (m->misc)
  225. pr_cont("MISC %llx ", m->misc);
  226. pr_cont("\n");
  227. /*
  228. * Note this output is parsed by external tools and old fields
  229. * should not be changed.
  230. */
  231. pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
  232. m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
  233. cpu_data(m->extcpu).microcode);
  234. /*
  235. * Print out human-readable details about the MCE error,
  236. * (if the CPU has an implementation for that)
  237. */
  238. ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
  239. if (ret == NOTIFY_STOP)
  240. return;
  241. pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
  242. }
  243. #define PANIC_TIMEOUT 5 /* 5 seconds */
  244. static atomic_t mce_panicked;
  245. static int fake_panic;
  246. static atomic_t mce_fake_panicked;
  247. /* Panic in progress. Enable interrupts and wait for final IPI */
  248. static void wait_for_panic(void)
  249. {
  250. long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
  251. preempt_disable();
  252. local_irq_enable();
  253. while (timeout-- > 0)
  254. udelay(1);
  255. if (panic_timeout == 0)
  256. panic_timeout = mca_cfg.panic_timeout;
  257. panic("Panicking machine check CPU died");
  258. }
  259. static void mce_panic(const char *msg, struct mce *final, char *exp)
  260. {
  261. int i, apei_err = 0;
  262. if (!fake_panic) {
  263. /*
  264. * Make sure only one CPU runs in machine check panic
  265. */
  266. if (atomic_inc_return(&mce_panicked) > 1)
  267. wait_for_panic();
  268. barrier();
  269. bust_spinlocks(1);
  270. console_verbose();
  271. } else {
  272. /* Don't log too much for fake panic */
  273. if (atomic_inc_return(&mce_fake_panicked) > 1)
  274. return;
  275. }
  276. /* First print corrected ones that are still unlogged */
  277. for (i = 0; i < MCE_LOG_LEN; i++) {
  278. struct mce *m = &mcelog.entry[i];
  279. if (!(m->status & MCI_STATUS_VAL))
  280. continue;
  281. if (!(m->status & MCI_STATUS_UC)) {
  282. print_mce(m);
  283. if (!apei_err)
  284. apei_err = apei_write_mce(m);
  285. }
  286. }
  287. /* Now print uncorrected but with the final one last */
  288. for (i = 0; i < MCE_LOG_LEN; i++) {
  289. struct mce *m = &mcelog.entry[i];
  290. if (!(m->status & MCI_STATUS_VAL))
  291. continue;
  292. if (!(m->status & MCI_STATUS_UC))
  293. continue;
  294. if (!final || memcmp(m, final, sizeof(struct mce))) {
  295. print_mce(m);
  296. if (!apei_err)
  297. apei_err = apei_write_mce(m);
  298. }
  299. }
  300. if (final) {
  301. print_mce(final);
  302. if (!apei_err)
  303. apei_err = apei_write_mce(final);
  304. }
  305. if (cpu_missing)
  306. pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
  307. if (exp)
  308. pr_emerg(HW_ERR "Machine check: %s\n", exp);
  309. if (!fake_panic) {
  310. if (panic_timeout == 0)
  311. panic_timeout = mca_cfg.panic_timeout;
  312. panic(msg);
  313. } else
  314. pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
  315. }
  316. /* Support code for software error injection */
  317. static int msr_to_offset(u32 msr)
  318. {
  319. unsigned bank = __this_cpu_read(injectm.bank);
  320. if (msr == mca_cfg.rip_msr)
  321. return offsetof(struct mce, ip);
  322. if (msr == MSR_IA32_MCx_STATUS(bank))
  323. return offsetof(struct mce, status);
  324. if (msr == MSR_IA32_MCx_ADDR(bank))
  325. return offsetof(struct mce, addr);
  326. if (msr == MSR_IA32_MCx_MISC(bank))
  327. return offsetof(struct mce, misc);
  328. if (msr == MSR_IA32_MCG_STATUS)
  329. return offsetof(struct mce, mcgstatus);
  330. return -1;
  331. }
  332. /* MSR access wrappers used for error injection */
  333. static u64 mce_rdmsrl(u32 msr)
  334. {
  335. u64 v;
  336. if (__this_cpu_read(injectm.finished)) {
  337. int offset = msr_to_offset(msr);
  338. if (offset < 0)
  339. return 0;
  340. return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
  341. }
  342. if (rdmsrl_safe(msr, &v)) {
  343. WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
  344. /*
  345. * Return zero in case the access faulted. This should
  346. * not happen normally but can happen if the CPU does
  347. * something weird, or if the code is buggy.
  348. */
  349. v = 0;
  350. }
  351. return v;
  352. }
  353. static void mce_wrmsrl(u32 msr, u64 v)
  354. {
  355. if (__this_cpu_read(injectm.finished)) {
  356. int offset = msr_to_offset(msr);
  357. if (offset >= 0)
  358. *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
  359. return;
  360. }
  361. wrmsrl(msr, v);
  362. }
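/*
 * Illustrative sketch (not from the original file): with the wrappers above,
 * an injector such as mce-inject can fake a bank's MSRs by filling the
 * per-CPU 'injectm' copy instead of touching hardware:
 *
 *	struct mce *i = this_cpu_ptr(&injectm);
 *
 *	i->bank = 1;
 *	i->status = MCI_STATUS_VAL | MCI_STATUS_UC;
 *	i->finished = 1;	// route mce_rdmsrl()/mce_wrmsrl() to injectm
 *
 * A later mce_rdmsrl(MSR_IA32_MCx_STATUS(1)) on this CPU then returns
 * i->status via msr_to_offset() rather than reading the real MSR.
 */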
  363. /*
  364. * Collect all global (w.r.t. this processor) status about this machine
  365. * check into our "mce" struct so that we can use it later to assess
  366. * the severity of the problem as we read per-bank specific details.
  367. */
  368. static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
  369. {
  370. mce_setup(m);
  371. m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
  372. if (regs) {
  373. /*
  374. * Get the address of the instruction at the time of
  375. * the machine check error.
  376. */
  377. if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
  378. m->ip = regs->ip;
  379. m->cs = regs->cs;
  380. /*
  381. * When in VM86 mode make the cs look like ring 3
  382. * always. This is a lie, but it's better than passing
  383. * the additional vm86 bit around everywhere.
  384. */
  385. if (v8086_mode(regs))
  386. m->cs |= 3;
  387. }
  388. /* Use accurate RIP reporting if available. */
  389. if (mca_cfg.rip_msr)
  390. m->ip = mce_rdmsrl(mca_cfg.rip_msr);
  391. }
  392. }
  393. /*
  394. * Simple lockless ring to communicate PFNs from the exception handler to the
  395. * process context work function. This is vastly simplified because there's
  396. * only a single reader and a single writer.
  397. */
  398. #define MCE_RING_SIZE 16 /* we use one entry less */
  399. struct mce_ring {
  400. unsigned short start;
  401. unsigned short end;
  402. unsigned long ring[MCE_RING_SIZE];
  403. };
  404. static DEFINE_PER_CPU(struct mce_ring, mce_ring);
  405. /* Runs with CPU affinity in workqueue */
  406. static int mce_ring_empty(void)
  407. {
  408. struct mce_ring *r = this_cpu_ptr(&mce_ring);
  409. return r->start == r->end;
  410. }
  411. static int mce_ring_get(unsigned long *pfn)
  412. {
  413. struct mce_ring *r;
  414. int ret = 0;
  415. *pfn = 0;
  416. get_cpu();
  417. r = this_cpu_ptr(&mce_ring);
  418. if (r->start == r->end)
  419. goto out;
  420. *pfn = r->ring[r->start];
  421. r->start = (r->start + 1) % MCE_RING_SIZE;
  422. ret = 1;
  423. out:
  424. put_cpu();
  425. return ret;
  426. }
  427. /* Always runs in MCE context with preempt off */
  428. static int mce_ring_add(unsigned long pfn)
  429. {
  430. struct mce_ring *r = this_cpu_ptr(&mce_ring);
  431. unsigned next;
  432. next = (r->end + 1) % MCE_RING_SIZE;
  433. if (next == r->start)
  434. return -1;
  435. r->ring[r->end] = pfn;
  436. wmb();
  437. r->end = next;
  438. return 0;
  439. }
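/*
 * Illustrative note (not from the original file): with MCE_RING_SIZE == 16
 * the ring holds at most 15 PFNs, since one slot is sacrificed to tell
 * "full" ((end + 1) % SIZE == start) from "empty" (end == start). Typical
 * flow for a faulting physical address:
 *
 *	// MCE/NMI context, preemption off:
 *	mce_ring_add(m.addr >> PAGE_SHIFT);
 *
 *	// later, from the workqueue (mce_process_work):
 *	unsigned long pfn;
 *
 *	while (mce_ring_get(&pfn))
 *		memory_failure(pfn, MCE_VECTOR, 0);
 */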
  440. int mce_available(struct cpuinfo_x86 *c)
  441. {
  442. if (mca_cfg.disabled)
  443. return 0;
  444. return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
  445. }
  446. static void mce_schedule_work(void)
  447. {
  448. if (!mce_ring_empty())
  449. schedule_work(this_cpu_ptr(&mce_work));
  450. }
  451. static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
  452. static void mce_irq_work_cb(struct irq_work *entry)
  453. {
  454. mce_notify_irq();
  455. mce_schedule_work();
  456. }
  457. static void mce_report_event(struct pt_regs *regs)
  458. {
  459. if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
  460. mce_notify_irq();
  461. /*
  462. * Triggering the work queue here is just an insurance
  463. * policy in case the syscall exit notify handler
  464. * doesn't run soon enough or ends up running on the
  465. * wrong CPU (can happen when audit sleeps)
  466. */
  467. mce_schedule_work();
  468. return;
  469. }
  470. irq_work_queue(this_cpu_ptr(&mce_irq_work));
  471. }
  472. /*
  473. * Read ADDR and MISC registers.
  474. */
  475. static void mce_read_aux(struct mce *m, int i)
  476. {
  477. if (m->status & MCI_STATUS_MISCV)
  478. m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
  479. if (m->status & MCI_STATUS_ADDRV) {
  480. m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
  481. /*
  482. * Mask the reported address by the reported granularity.
  483. */
  484. if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
  485. u8 shift = MCI_MISC_ADDR_LSB(m->misc);
  486. m->addr >>= shift;
  487. m->addr <<= shift;
  488. }
  489. }
  490. }
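/*
 * Worked example (not from the original file): when the bank reports an
 * address granularity in MCi_MISC, the low bits of MCi_ADDR are not
 * meaningful. With MCI_MISC_ADDR_LSB(m->misc) == 6 (cache-line granularity)
 * and a raw address of 0x12345678:
 *
 *	m->addr >>= 6;		// 0x48d159
 *	m->addr <<= 6;		// 0x12345640
 *
 * i.e. the reported address is rounded down to a 64-byte boundary.
 */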
  491. static bool memory_error(struct mce *m)
  492. {
  493. struct cpuinfo_x86 *c = &boot_cpu_data;
  494. if (c->x86_vendor == X86_VENDOR_AMD) {
  495. /*
  496. * coming soon
  497. */
  498. return false;
  499. } else if (c->x86_vendor == X86_VENDOR_INTEL) {
  500. /*
  501. * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
  502. *
  503. * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
  504. * indicating a memory error. Bit 8 is used for indicating a
  505. * cache hierarchy error. The combination of bit 2 and bit 3
  506. * is used for indicating a `generic' cache hierarchy error
  507. * But we can't just blindly check the above bits, because if
  508. * bit 11 is set, then it is a bus/interconnect error - and
  509. * either way the above bits just gives more detail on what
  510. * bus/interconnect error happened. Note that bit 12 can be
  511. * ignored, as it's the "filter" bit.
  512. */
  513. return (m->status & 0xef80) == BIT(7) ||
  514. (m->status & 0xef00) == BIT(8) ||
  515. (m->status & 0xeffc) == 0xc;
  516. }
  517. return false;
  518. }
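/*
 * Worked example (not from the original file): under the SDM compound error
 * code encoding, a DRAM read error on an unspecified channel has MCACOD
 * 0x009f (0000_0000_1MMM_CCCC with MMM = 001, CCCC = 1111):
 *
 *	(0x009f & 0xef80) == BIT(7)	-> treated as a memory error
 *
 * whereas a bus/interconnect code with bit 11 set, e.g. 0x0e0b, fails all
 * three checks above and is correctly not treated as a memory error.
 */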
  519. DEFINE_PER_CPU(unsigned, mce_poll_count);
  520. /*
  521. * Poll for corrected events or events that happened before reset.
  522. * Those are just logged through /dev/mcelog.
  523. *
  524. * This is executed in standard interrupt context.
  525. *
  526. * Note: spec recommends to panic for fatal unsignalled
  527. * errors here. However this would be quite problematic --
  528. * we would need to reimplement the Monarch handling and
  529. * it would mess up the exclusion between exception handler
  530. * and poll handler -- so we skip this for now.
  531. * These cases should not happen anyway, or only when the CPU
  532. * is already totally confused. In this case it's likely it will
  533. * not fully execute the machine check handler either.
  534. */
  535. bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
  536. {
  537. bool error_logged = false;
  538. struct mce m;
  539. int severity;
  540. int i;
  541. this_cpu_inc(mce_poll_count);
  542. mce_gather_info(&m, NULL);
  543. for (i = 0; i < mca_cfg.banks; i++) {
  544. if (!mce_banks[i].ctl || !test_bit(i, *b))
  545. continue;
  546. m.misc = 0;
  547. m.addr = 0;
  548. m.bank = i;
  549. m.tsc = 0;
  550. barrier();
  551. m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
  552. if (!(m.status & MCI_STATUS_VAL))
  553. continue;
  554. /*
  555. * Uncorrected or signalled events are handled by the exception
  556. * handler when it is enabled, so don't process those here.
  557. *
  558. * TBD do the same check for MCI_STATUS_EN here?
  559. */
  560. if (!(flags & MCP_UC) &&
  561. (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
  562. continue;
  563. mce_read_aux(&m, i);
  564. if (!(flags & MCP_TIMESTAMP))
  565. m.tsc = 0;
  566. severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
  567. /*
  568. * In the cases where we don't have a valid address after all,
  569. * do not add it into the ring buffer.
  570. */
  571. if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
  572. if (m.status & MCI_STATUS_ADDRV) {
  573. mce_ring_add(m.addr >> PAGE_SHIFT);
  574. mce_schedule_work();
  575. }
  576. }
  577. /*
  578. * Don't get the IP here because it's unlikely to
  579. * have anything to do with the actual error location.
  580. */
  581. if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
  582. error_logged = true;
  583. mce_log(&m);
  584. }
  585. /*
  586. * Clear state for this bank.
  587. */
  588. mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
  589. }
  590. /*
  591. * Don't clear MCG_STATUS here because it's only defined for
  592. * exceptions.
  593. */
  594. sync_core();
  595. return error_logged;
  596. }
  597. EXPORT_SYMBOL_GPL(machine_check_poll);
  598. /*
  599. * Do a quick check if any of the events requires a panic.
  600. * This decides if we keep the events around or clear them.
  601. */
  602. static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
  603. struct pt_regs *regs)
  604. {
  605. int i, ret = 0;
  606. char *tmp;
  607. for (i = 0; i < mca_cfg.banks; i++) {
  608. m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
  609. if (m->status & MCI_STATUS_VAL) {
  610. __set_bit(i, validp);
  611. if (quirk_no_way_out)
  612. quirk_no_way_out(i, m, regs);
  613. }
  614. if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
  615. *msg = tmp;
  616. ret = 1;
  617. }
  618. }
  619. return ret;
  620. }
  621. /*
  622. * Variable to establish order between CPUs while scanning.
  623. * Each CPU spins initially until mce_executing is equal to its number.
  624. */
  625. static atomic_t mce_executing;
  626. /*
  627. * Defines order of CPUs on entry. First CPU becomes Monarch.
  628. */
  629. static atomic_t mce_callin;
  630. /*
  631. * Check if a timeout waiting for other CPUs happened.
  632. */
  633. static int mce_timed_out(u64 *t, const char *msg)
  634. {
  635. /*
  636. * The others already did panic for some reason.
  637. * Bail out like in a timeout.
  638. * rmb() to tell the compiler that system_state
  639. * might have been modified by someone else.
  640. */
  641. rmb();
  642. if (atomic_read(&mce_panicked))
  643. wait_for_panic();
  644. if (!mca_cfg.monarch_timeout)
  645. goto out;
  646. if ((s64)*t < SPINUNIT) {
  647. if (mca_cfg.tolerant <= 1)
  648. mce_panic(msg, NULL, NULL);
  649. cpu_missing = 1;
  650. return 1;
  651. }
  652. *t -= SPINUNIT;
  653. out:
  654. touch_nmi_watchdog();
  655. return 0;
  656. }
  657. /*
  658. * The Monarch's reign. The Monarch is the CPU who entered
  659. * the machine check handler first. It waits for the others to
  660. * raise the exception too and then grades them. If any
  661. * error is fatal, panic. Only then let the others continue.
  662. *
  663. * The other CPUs entering the MCE handler will be controlled by the
  664. * Monarch. They are called Subjects.
  665. *
  666. * This way we prevent any potential data corruption in an unrecoverable case
  667. * and also make sure that all CPUs' errors are examined.
  668. *
  669. * Also this detects the case of a machine check event coming from outer
  670. * space (not detected by any CPU). In this case some external agent wants
  671. * us to shut down, so panic too.
  672. *
  673. * The other CPUs might still decide to panic if the handler happens
  674. * in an unrecoverable place, but in this case the system is in a semi-stable
  675. * state and won't corrupt anything by itself. It's ok to let the others
  676. * continue for a bit first.
  677. *
  678. * All the spin loops have timeouts; when a timeout happens a CPU
  679. * typically elects itself to be Monarch.
  680. */
  681. static void mce_reign(void)
  682. {
  683. int cpu;
  684. struct mce *m = NULL;
  685. int global_worst = 0;
  686. char *msg = NULL;
  687. char *nmsg = NULL;
  688. /*
  689. * This CPU is the Monarch and the other CPUs have run
  690. * through their handlers.
  691. * Grade the severity of the errors of all the CPUs.
  692. */
  693. for_each_possible_cpu(cpu) {
  694. int severity = mce_severity(&per_cpu(mces_seen, cpu),
  695. mca_cfg.tolerant,
  696. &nmsg, true);
  697. if (severity > global_worst) {
  698. msg = nmsg;
  699. global_worst = severity;
  700. m = &per_cpu(mces_seen, cpu);
  701. }
  702. }
  703. /*
  704. * Cannot recover? Panic here then.
  705. * This dumps all the mces in the log buffer and stops the
  706. * other CPUs.
  707. */
  708. if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
  709. mce_panic("Fatal machine check", m, msg);
  710. /*
  711. * For UC somewhere we let the CPU who detects it handle it.
  712. * Also must let continue the others, otherwise the handling
  713. * CPU could deadlock on a lock.
  714. */
  715. /*
  716. * No machine check event found. Must be some external
  717. * source or one CPU is hung. Panic.
  718. */
  719. if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
  720. mce_panic("Fatal machine check from unknown source", NULL, NULL);
  721. /*
  722. * Now clear all the mces_seen so that they don't reappear on
  723. * the next mce.
  724. */
  725. for_each_possible_cpu(cpu)
  726. memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
  727. }
  728. static atomic_t global_nwo;
  729. /*
  730. * Start of Monarch synchronization. This waits until all CPUs have
  731. * entered the exception handler and then determines if any of them
  732. * saw a fatal event that requires panic. Then it executes them
  733. * in the entry order.
  734. * TBD double check parallel CPU hotunplug
  735. */
  736. static int mce_start(int *no_way_out)
  737. {
  738. int order;
  739. int cpus = num_online_cpus();
  740. u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
  741. if (!timeout)
  742. return -1;
  743. atomic_add(*no_way_out, &global_nwo);
  744. /*
  745. * global_nwo should be updated before mce_callin
  746. */
  747. smp_wmb();
  748. order = atomic_inc_return(&mce_callin);
  749. /*
  750. * Wait for everyone.
  751. */
  752. while (atomic_read(&mce_callin) != cpus) {
  753. if (mce_timed_out(&timeout,
  754. "Timeout: Not all CPUs entered broadcast exception handler")) {
  755. atomic_set(&global_nwo, 0);
  756. return -1;
  757. }
  758. ndelay(SPINUNIT);
  759. }
  760. /*
  761. * mce_callin should be read before global_nwo
  762. */
  763. smp_rmb();
  764. if (order == 1) {
  765. /*
  766. * Monarch: Starts executing now, the others wait.
  767. */
  768. atomic_set(&mce_executing, 1);
  769. } else {
  770. /*
  771. * Subject: Now start the scanning loop one by one in
  772. * the original callin order.
  773. * This way, when there are any shared banks, they will be
  774. * seen by only one CPU before being cleared, avoiding duplicates.
  775. */
  776. while (atomic_read(&mce_executing) < order) {
  777. if (mce_timed_out(&timeout,
  778. "Timeout: Subject CPUs unable to finish machine check processing")) {
  779. atomic_set(&global_nwo, 0);
  780. return -1;
  781. }
  782. ndelay(SPINUNIT);
  783. }
  784. }
  785. /*
  786. * Cache the global no_way_out state.
  787. */
  788. *no_way_out = atomic_read(&global_nwo);
  789. return order;
  790. }
  791. /*
  792. * Synchronize between CPUs after main scanning loop.
  793. * This invokes the bulk of the Monarch processing.
  794. */
  795. static int mce_end(int order)
  796. {
  797. int ret = -1;
  798. u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
  799. if (!timeout)
  800. goto reset;
  801. if (order < 0)
  802. goto reset;
  803. /*
  804. * Allow others to run.
  805. */
  806. atomic_inc(&mce_executing);
  807. if (order == 1) {
  808. /* CHECKME: Can this race with a parallel hotplug? */
  809. int cpus = num_online_cpus();
  810. /*
  811. * Monarch: Wait for everyone to go through their scanning
  812. * loops.
  813. */
  814. while (atomic_read(&mce_executing) <= cpus) {
  815. if (mce_timed_out(&timeout,
  816. "Timeout: Monarch CPU unable to finish machine check processing"))
  817. goto reset;
  818. ndelay(SPINUNIT);
  819. }
  820. mce_reign();
  821. barrier();
  822. ret = 0;
  823. } else {
  824. /*
  825. * Subject: Wait for Monarch to finish.
  826. */
  827. while (atomic_read(&mce_executing) != 0) {
  828. if (mce_timed_out(&timeout,
  829. "Timeout: Monarch CPU did not finish machine check processing"))
  830. goto reset;
  831. ndelay(SPINUNIT);
  832. }
  833. /*
  834. * Don't reset anything. That's done by the Monarch.
  835. */
  836. return 0;
  837. }
  838. /*
  839. * Reset all global state.
  840. */
  841. reset:
  842. atomic_set(&global_nwo, 0);
  843. atomic_set(&mce_callin, 0);
  844. barrier();
  845. /*
  846. * Let others run again.
  847. */
  848. atomic_set(&mce_executing, 0);
  849. return ret;
  850. }
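/*
 * Illustrative timeline (not from the original file) for a broadcast MCE on
 * four CPUs, assuming CPU2 takes the exception first:
 *
 *	CPU2: atomic_inc_return(&mce_callin) == 1	-> Monarch
 *	CPU0/1/3: callin order 2/3/4			-> Subjects
 *
 *	1. Everyone spins in mce_start() until mce_callin == 4.
 *	2. The Monarch sets mce_executing = 1 and scans its banks.
 *	3. In mce_end() it increments mce_executing, releasing the Subject
 *	   with order 2 from mce_start(); Subjects therefore scan one at a
 *	   time in callin order, each incrementing mce_executing when done.
 *	4. Once mce_executing > 4 the Monarch runs mce_reign(), then resets
 *	   the global state, which releases the Subjects from mce_end().
 *
 * A CPU that times out either panics (tolerant <= 1) or sets cpu_missing
 * and falls back to its local no_way_out state.
 */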
  851. /*
  852. * Check if the address reported by the CPU is in a format we can parse.
  853. * It would be possible to add code for most other cases, but all would
  854. * be somewhat complicated (e.g. segment offset would require an instruction
  855. * parser). So only support physical addresses up to page granularity for now.
  856. */
  857. static int mce_usable_address(struct mce *m)
  858. {
  859. if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
  860. return 0;
  861. if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
  862. return 0;
  863. if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
  864. return 0;
  865. return 1;
  866. }
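/*
 * Worked example (not from the original file): with 4K pages
 * (PAGE_SHIFT == 12), an MCi_MISC encoding ADDR_LSB == 12 in physical
 * address mode passes the checks above, so m->addr >> PAGE_SHIFT can be fed
 * to memory_failure(). A coarser granularity (e.g. ADDR_LSB == 30, 1G) or a
 * segment-offset address mode is rejected.
 */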
  867. static void mce_clear_state(unsigned long *toclear)
  868. {
  869. int i;
  870. for (i = 0; i < mca_cfg.banks; i++) {
  871. if (test_bit(i, toclear))
  872. mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
  873. }
  874. }
  875. /*
  876. * The actual machine check handler. This only handles real
  877. * exceptions when something got corrupted coming in through int 18.
  878. *
  879. * This is executed in NMI context not subject to normal locking rules. This
  880. * implies that most kernel services cannot be safely used. Don't even
  881. * think about putting a printk in there!
  882. *
  883. * On Intel systems this is entered on all CPUs in parallel through
  884. * MCE broadcast. However some CPUs might be broken beyond repair,
  885. * so always be careful when synchronizing with others.
  886. */
  887. void do_machine_check(struct pt_regs *regs, long error_code)
  888. {
  889. struct mca_config *cfg = &mca_cfg;
  890. struct mce m, *final;
  891. enum ctx_state prev_state;
  892. int i;
  893. int worst = 0;
  894. int severity;
  895. /*
  896. * Establish sequential order between the CPUs entering the machine
  897. * check handler.
  898. */
  899. int order;
  900. /*
  901. * If no_way_out gets set, there is no safe way to recover from this
  902. * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
  903. */
  904. int no_way_out = 0;
  905. /*
  906. * If kill_it gets set, there might be a way to recover from this
  907. * error.
  908. */
  909. int kill_it = 0;
  910. DECLARE_BITMAP(toclear, MAX_NR_BANKS);
  911. DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
  912. char *msg = "Unknown";
  913. u64 recover_paddr = ~0ull;
  914. int flags = MF_ACTION_REQUIRED;
  915. int lmce = 0;
  916. prev_state = ist_enter(regs);
  917. this_cpu_inc(mce_exception_count);
  918. if (!cfg->banks)
  919. goto out;
  920. mce_gather_info(&m, regs);
  921. final = this_cpu_ptr(&mces_seen);
  922. *final = m;
  923. memset(valid_banks, 0, sizeof(valid_banks));
  924. no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
  925. barrier();
  926. /*
  927. * When there is no restart IP we might need to kill or panic.
  928. * Assume the worst for now, but if we find the
  929. * severity is MCE_AR_SEVERITY we have other options.
  930. */
  931. if (!(m.mcgstatus & MCG_STATUS_RIPV))
  932. kill_it = 1;
  933. /*
  934. * Check if this MCE is signaled to only this logical processor
  935. */
  936. if (m.mcgstatus & MCG_STATUS_LMCES)
  937. lmce = 1;
  938. else {
  939. /*
  940. * Go through all the banks in exclusion of the other CPUs.
  941. * This way we don't report duplicated events on shared banks
  942. * because the first one to see it will clear it.
  943. * If this is a Local MCE, then no need to perform rendezvous.
  944. */
  945. order = mce_start(&no_way_out);
  946. }
  947. for (i = 0; i < cfg->banks; i++) {
  948. __clear_bit(i, toclear);
  949. if (!test_bit(i, valid_banks))
  950. continue;
  951. if (!mce_banks[i].ctl)
  952. continue;
  953. m.misc = 0;
  954. m.addr = 0;
  955. m.bank = i;
  956. m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
  957. if ((m.status & MCI_STATUS_VAL) == 0)
  958. continue;
  959. /*
  960. * Non-uncorrected or non-signaled errors are handled by
  961. * machine_check_poll(). Leave them alone, unless this panics.
  962. */
  963. if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
  964. !no_way_out)
  965. continue;
  966. /*
  967. * Set taint even when machine check was not enabled.
  968. */
  969. add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
  970. severity = mce_severity(&m, cfg->tolerant, NULL, true);
  971. /*
  972. * When the machine check is for the corrected/deferred handler, don't
  973. * touch it here, unless we're panicking.
  974. */
  975. if ((severity == MCE_KEEP_SEVERITY ||
  976. severity == MCE_UCNA_SEVERITY) && !no_way_out)
  977. continue;
  978. __set_bit(i, toclear);
  979. if (severity == MCE_NO_SEVERITY) {
  980. /*
  981. * Machine check event was not enabled. Clear, but
  982. * ignore.
  983. */
  984. continue;
  985. }
  986. mce_read_aux(&m, i);
  987. /*
  988. * Action optional error. Queue address for later processing.
  989. * When the ring overflows we just ignore the AO error.
  990. * RED-PEN add some logging mechanism when
  991. * mce_usable_address() or mce_ring_add() fails.
  992. * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
  993. */
  994. if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
  995. mce_ring_add(m.addr >> PAGE_SHIFT);
  996. mce_log(&m);
  997. if (severity > worst) {
  998. *final = m;
  999. worst = severity;
  1000. }
  1001. }
  1002. /* mce_clear_state will clear *final, save locally for use later */
  1003. m = *final;
  1004. if (!no_way_out)
  1005. mce_clear_state(toclear);
  1006. /*
  1007. * Do most of the synchronization with other CPUs.
  1008. * When there's any problem use only local no_way_out state.
  1009. */
  1010. if (!lmce) {
  1011. if (mce_end(order) < 0)
  1012. no_way_out = worst >= MCE_PANIC_SEVERITY;
  1013. } else {
  1014. /*
  1015. * Local MCE skipped calling mce_reign().
  1016. * If we found a fatal error, we need to panic here.
  1017. */
  1018. if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
  1019. mce_panic("Machine check from unknown source",
  1020. NULL, NULL);
  1021. }
  1022. /*
  1023. * At insane "tolerant" levels we take no action. Otherwise
  1024. * we only die if we have no other choice. For less serious
  1025. * issues we try to recover, or limit damage to the current
  1026. * process.
  1027. */
  1028. if (cfg->tolerant < 3) {
  1029. if (no_way_out)
  1030. mce_panic("Fatal machine check on current CPU", &m, msg);
  1031. if (worst == MCE_AR_SEVERITY) {
  1032. recover_paddr = m.addr;
  1033. if (!(m.mcgstatus & MCG_STATUS_RIPV))
  1034. flags |= MF_MUST_KILL;
  1035. } else if (kill_it) {
  1036. force_sig(SIGBUS, current);
  1037. }
  1038. }
  1039. if (worst > 0)
  1040. mce_report_event(regs);
  1041. mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
  1042. out:
  1043. sync_core();
  1044. if (recover_paddr == ~0ull)
  1045. goto done;
  1046. pr_err("Uncorrected hardware memory error in user-access at %llx",
  1047. recover_paddr);
  1048. /*
  1049. * We must call memory_failure() here even if the current process is
  1050. * doomed. We still need to mark the page as poisoned and alert any
  1051. * other users of the page.
  1052. */
  1053. ist_begin_non_atomic(regs);
  1054. local_irq_enable();
  1055. if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
  1056. pr_err("Memory error not recovered");
  1057. force_sig(SIGBUS, current);
  1058. }
  1059. local_irq_disable();
  1060. ist_end_non_atomic();
  1061. done:
  1062. ist_exit(regs, prev_state);
  1063. }
  1064. EXPORT_SYMBOL_GPL(do_machine_check);
  1065. #ifndef CONFIG_MEMORY_FAILURE
  1066. int memory_failure(unsigned long pfn, int vector, int flags)
  1067. {
  1068. /* mce_severity() should not hand us an ACTION_REQUIRED error */
  1069. BUG_ON(flags & MF_ACTION_REQUIRED);
  1070. pr_err("Uncorrected memory error in page 0x%lx ignored\n"
  1071. "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
  1072. pfn);
  1073. return 0;
  1074. }
  1075. #endif
  1076. /*
  1077. * Action optional processing happens here (picking up
  1078. * from the list of faulting pages that do_machine_check()
  1079. * placed into the "ring").
  1080. */
  1081. static void mce_process_work(struct work_struct *dummy)
  1082. {
  1083. unsigned long pfn;
  1084. while (mce_ring_get(&pfn))
  1085. memory_failure(pfn, MCE_VECTOR, 0);
  1086. }
  1087. #ifdef CONFIG_X86_MCE_INTEL
  1088. /***
  1089. * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
  1090. * @cpu: The CPU on which the event occurred.
  1091. * @status: Event status information
  1092. *
  1093. * This function should be called by the thermal interrupt after the
  1094. * event has been processed and the decision was made to log the event
  1095. * further.
  1096. *
  1097. * The status parameter will be saved to the 'status' field of 'struct mce'
  1098. * and historically has been the register value of the
  1099. * MSR_IA32_THERMAL_STATUS (Intel) msr.
  1100. */
  1101. void mce_log_therm_throt_event(__u64 status)
  1102. {
  1103. struct mce m;
  1104. mce_setup(&m);
  1105. m.bank = MCE_THERMAL_BANK;
  1106. m.status = status;
  1107. mce_log(&m);
  1108. }
  1109. #endif /* CONFIG_X86_MCE_INTEL */
  1110. /*
  1111. * Periodic polling timer for "silent" machine check errors. If the
  1112. * poller finds an MCE, poll 2x faster. When the poller finds no more
  1113. * errors, poll 2x slower (up to check_interval seconds).
  1114. */
  1115. static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
  1116. static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
  1117. static DEFINE_PER_CPU(struct timer_list, mce_timer);
  1118. static unsigned long mce_adjust_timer_default(unsigned long interval)
  1119. {
  1120. return interval;
  1121. }
  1122. static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
  1123. static void __restart_timer(struct timer_list *t, unsigned long interval)
  1124. {
  1125. unsigned long when = jiffies + interval;
  1126. unsigned long flags;
  1127. local_irq_save(flags);
  1128. if (timer_pending(t)) {
  1129. if (time_before(when, t->expires))
  1130. mod_timer_pinned(t, when);
  1131. } else {
  1132. t->expires = round_jiffies(when);
  1133. add_timer_on(t, smp_processor_id());
  1134. }
  1135. local_irq_restore(flags);
  1136. }
  1137. static void mce_timer_fn(unsigned long data)
  1138. {
  1139. struct timer_list *t = this_cpu_ptr(&mce_timer);
  1140. int cpu = smp_processor_id();
  1141. unsigned long iv;
  1142. WARN_ON(cpu != data);
  1143. iv = __this_cpu_read(mce_next_interval);
  1144. if (mce_available(this_cpu_ptr(&cpu_info))) {
  1145. machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
  1146. if (mce_intel_cmci_poll()) {
  1147. iv = mce_adjust_timer(iv);
  1148. goto done;
  1149. }
  1150. }
  1151. /*
  1152. * Alert userspace if needed. If we logged an MCE, reduce the polling
  1153. * interval, otherwise increase the polling interval.
  1154. */
  1155. if (mce_notify_irq())
  1156. iv = max(iv / 2, (unsigned long) HZ/100);
  1157. else
  1158. iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
  1159. done:
  1160. __this_cpu_write(mce_next_interval, iv);
  1161. __restart_timer(t, iv);
  1162. }
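/*
 * Worked example (not from the original file): assuming the usual 5 minute
 * INITIAL_CHECK_INTERVAL and HZ == 1000, the per-CPU polling interval
 * evolves roughly as:
 *
 *	300s -> events logged -> 150s -> 75s -> ... -> floor of HZ/100 (10ms)
 *	10ms -> nothing found -> 20ms -> 40ms -> ... -> ceiling of 300s
 *
 * i.e. iv is halved after an interval in which mce_notify_irq() reported
 * events and doubled otherwise, clamped to [HZ/100, check_interval * HZ].
 */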
  1163. /*
  1164. * Ensure that the timer is firing in @interval from now.
  1165. */
  1166. void mce_timer_kick(unsigned long interval)
  1167. {
  1168. struct timer_list *t = this_cpu_ptr(&mce_timer);
  1169. unsigned long iv = __this_cpu_read(mce_next_interval);
  1170. __restart_timer(t, interval);
  1171. if (interval < iv)
  1172. __this_cpu_write(mce_next_interval, interval);
  1173. }
  1174. /* Must not be called in IRQ context where del_timer_sync() can deadlock */
  1175. static void mce_timer_delete_all(void)
  1176. {
  1177. int cpu;
  1178. for_each_online_cpu(cpu)
  1179. del_timer_sync(&per_cpu(mce_timer, cpu));
  1180. }
  1181. static void mce_do_trigger(struct work_struct *work)
  1182. {
  1183. call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
  1184. }
  1185. static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
  1186. /*
  1187. * Notify the user(s) about new machine check events.
  1188. * Can be called from interrupt context, but not from machine check/NMI
  1189. * context.
  1190. */
  1191. int mce_notify_irq(void)
  1192. {
  1193. /* Not more than two messages every minute */
  1194. static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
  1195. if (test_and_clear_bit(0, &mce_need_notify)) {
  1196. /* wake processes polling /dev/mcelog */
  1197. wake_up_interruptible(&mce_chrdev_wait);
  1198. if (mce_helper[0])
  1199. schedule_work(&mce_trigger_work);
  1200. if (__ratelimit(&ratelimit))
  1201. pr_info(HW_ERR "Machine check events logged\n");
  1202. return 1;
  1203. }
  1204. return 0;
  1205. }
  1206. EXPORT_SYMBOL_GPL(mce_notify_irq);
  1207. static int __mcheck_cpu_mce_banks_init(void)
  1208. {
  1209. int i;
  1210. u8 num_banks = mca_cfg.banks;
  1211. mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
  1212. if (!mce_banks)
  1213. return -ENOMEM;
  1214. for (i = 0; i < num_banks; i++) {
  1215. struct mce_bank *b = &mce_banks[i];
  1216. b->ctl = -1ULL;
  1217. b->init = 1;
  1218. }
  1219. return 0;
  1220. }
  1221. /*
  1222. * Initialize Machine Checks for a CPU.
  1223. */
  1224. static int __mcheck_cpu_cap_init(void)
  1225. {
  1226. unsigned b;
  1227. u64 cap;
  1228. rdmsrl(MSR_IA32_MCG_CAP, cap);
  1229. b = cap & MCG_BANKCNT_MASK;
  1230. if (!mca_cfg.banks)
  1231. pr_info("CPU supports %d MCE banks\n", b);
  1232. if (b > MAX_NR_BANKS) {
  1233. pr_warn("Using only %u machine check banks out of %u\n",
  1234. MAX_NR_BANKS, b);
  1235. b = MAX_NR_BANKS;
  1236. }
  1237. /* Don't support asymmetric configurations today */
  1238. WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
  1239. mca_cfg.banks = b;
  1240. if (!mce_banks) {
  1241. int err = __mcheck_cpu_mce_banks_init();
  1242. if (err)
  1243. return err;
  1244. }
  1245. /* Use accurate RIP reporting if available. */
  1246. if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
  1247. mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
  1248. if (cap & MCG_SER_P)
  1249. mca_cfg.ser = true;
  1250. return 0;
  1251. }
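/*
 * Worked example (not from the original file): a hypothetical MCG_CAP value
 * of 0x1000c16 decodes here as:
 *
 *	cap & MCG_BANKCNT_MASK == 0x16	-> 22 banks, mca_cfg.banks = 22
 *	cap & MCG_EXT_P (bit 9) == 0	-> no extended registers, so
 *					   mca_cfg.rip_msr stays unset
 *	cap & MCG_SER_P (bit 24) != 0	-> software error recovery supported,
 *					   mca_cfg.ser = true
 */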
  1252. static void __mcheck_cpu_init_generic(void)
  1253. {
  1254. enum mcp_flags m_fl = 0;
  1255. mce_banks_t all_banks;
  1256. u64 cap;
  1257. int i;
  1258. if (!mca_cfg.bootlog)
  1259. m_fl = MCP_DONTLOG;
  1260. /*
  1261. * Log the machine checks left over from the previous reset.
  1262. */
  1263. bitmap_fill(all_banks, MAX_NR_BANKS);
  1264. machine_check_poll(MCP_UC | m_fl, &all_banks);
  1265. cr4_set_bits(X86_CR4_MCE);
  1266. rdmsrl(MSR_IA32_MCG_CAP, cap);
  1267. if (cap & MCG_CTL_P)
  1268. wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
  1269. for (i = 0; i < mca_cfg.banks; i++) {
  1270. struct mce_bank *b = &mce_banks[i];
  1271. if (!b->init)
  1272. continue;
  1273. wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
  1274. wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
  1275. }
  1276. }
  1277. /*
  1278. * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
  1279. * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
  1280. * Vol 3B Table 15-20). But this confuses both the code that determines
  1281. * whether the machine check occurred in kernel or user mode, and also
  1282. * the severity assessment code. Pretend that EIPV was set, and take the
  1283. * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
  1284. */
  1285. static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
  1286. {
  1287. if (bank != 0)
  1288. return;
  1289. if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
  1290. return;
  1291. if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
  1292. MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
  1293. MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
  1294. MCACOD)) !=
  1295. (MCI_STATUS_UC|MCI_STATUS_EN|
  1296. MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
  1297. MCI_STATUS_AR|MCACOD_INSTR))
  1298. return;
  1299. m->mcgstatus |= MCG_STATUS_EIPV;
  1300. m->ip = regs->ip;
  1301. m->cs = regs->cs;
  1302. }
  1303. /* Add per CPU specific workarounds here */
  1304. static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
  1305. {
  1306. struct mca_config *cfg = &mca_cfg;
  1307. if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
  1308. pr_info("unknown CPU type - not enabling MCE support\n");
  1309. return -EOPNOTSUPP;
  1310. }
  1311. /* This should be disabled by the BIOS, but isn't always */
  1312. if (c->x86_vendor == X86_VENDOR_AMD) {
  1313. if (c->x86 == 15 && cfg->banks > 4) {
  1314. /*
  1315. * disable GART TBL walk error reporting, which
  1316. * trips off incorrectly with the IOMMU & 3ware
  1317. * & Cerberus:
  1318. */
  1319. clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
  1320. }
  1321. if (c->x86 <= 17 && cfg->bootlog < 0) {
  1322. /*
  1323. * Lots of broken BIOSes around that don't clear them
  1324. * by default and leave crap in there. Don't log:
  1325. */
  1326. cfg->bootlog = 0;
  1327. }
  1328. /*
  1329. * Various K7s with broken bank 0 around. Always disable
  1330. * by default.
  1331. */
  1332. if (c->x86 == 6 && cfg->banks > 0)
  1333. mce_banks[0].ctl = 0;
  1334. /*
  1335. * overflow_recov is supported for F15h Models 00h-0fh
  1336. * even though we don't have a CPUID bit for it.
  1337. */
  1338. if (c->x86 == 0x15 && c->x86_model <= 0xf)
  1339. mce_flags.overflow_recov = 1;
  1340. /*
  1341. * Turn off MC4_MISC thresholding banks on those models since
  1342. * they're not supported there.
  1343. */
  1344. if (c->x86 == 0x15 &&
  1345. (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
  1346. int i;
  1347. u64 hwcr;
  1348. bool need_toggle;
  1349. u32 msrs[] = {
  1350. 0x00000413, /* MC4_MISC0 */
  1351. 0xc0000408, /* MC4_MISC1 */
  1352. };
  1353. rdmsrl(MSR_K7_HWCR, hwcr);
  1354. /* McStatusWrEn has to be set */
  1355. need_toggle = !(hwcr & BIT(18));
  1356. if (need_toggle)
  1357. wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
  1358. /* Clear CntP bit safely */
  1359. for (i = 0; i < ARRAY_SIZE(msrs); i++)
  1360. msr_clear_bit(msrs[i], 62);
  1361. /* restore old settings */
  1362. if (need_toggle)
  1363. wrmsrl(MSR_K7_HWCR, hwcr);
  1364. }
  1365. }
  1366. if (c->x86_vendor == X86_VENDOR_INTEL) {
  1367. /*
  1368. * SDM documents that on family 6 bank 0 should not be written
  1369. * because it aliases to another special BIOS controlled
  1370. * register.
  1371. * But it's not aliased anymore on model 0x1a+.
  1372. * Don't ignore bank 0 completely because there could be a
  1373. * valid event later, merely don't write CTL0.
  1374. */
  1375. if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
  1376. mce_banks[0].init = 0;
  1377. /*
  1378. * All newer Intel systems support MCE broadcasting. Enable
  1379. * synchronization with a one second timeout.
  1380. */
  1381. if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
  1382. cfg->monarch_timeout < 0)
  1383. cfg->monarch_timeout = USEC_PER_SEC;
  1384. /*
  1385. * There are also broken BIOSes on some Pentium M and
  1386. * earlier systems:
  1387. */
  1388. if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
  1389. cfg->bootlog = 0;
  1390. if (c->x86 == 6 && c->x86_model == 45)
  1391. quirk_no_way_out = quirk_sandybridge_ifu;
  1392. }
  1393. if (cfg->monarch_timeout < 0)
  1394. cfg->monarch_timeout = 0;
  1395. if (cfg->bootlog != 0)
  1396. cfg->panic_timeout = 30;
  1397. return 0;
  1398. }

static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return 0;

	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		return 1;
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		return 1;
		break;
	}

	return 0;
}

static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		mce_adjust_timer = cmci_intel_adjust_timer;
		break;
	case X86_VENDOR_AMD: {
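		/*
		 * CPUID Fn8000_0007 EBX advertises RAS capabilities on AMD:
		 * bit 0 = MCA overflow recovery, bit 1 = SUCCOR (software
		 * uncorrectable error containment and recovery).
		 */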
		u32 ebx = cpuid_ebx(0x80000007);

		mce_amd_feature_init(c);
		mce_flags.overflow_recov = !!(ebx & BIT(0));
		mce_flags.succor	 = !!(ebx & BIT(1));
		break;
		}
	default:
		break;
	}
}

static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
	unsigned long iv = check_interval * HZ;

	if (mca_cfg.ignore_ce || !iv)
		return;

	per_cpu(mce_next_interval, cpu) = iv;

	t->expires = round_jiffies(jiffies + iv);
	add_timer_on(t, cpu);
}

static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);
	unsigned int cpu = smp_processor_id();

	setup_timer(t, mce_timer_fn, cpu);
	mce_start_timer(cpu, t);
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return;

	if (__mcheck_cpu_ancient_init(c))
		return;

	if (!mce_available(c))
		return;

	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
		mca_cfg.disabled = true;
		return;
	}

	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_timer();
	INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
	init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
}

/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;	/* #times opened */
static int mce_chrdev_open_exclu;	/* already open exclusive? */

static int mce_chrdev_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	if (mce_chrdev_open_exclu ||
	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_chrdev_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		mce_chrdev_open_exclu = 1;
	mce_chrdev_open_count++;

	spin_unlock(&mce_chrdev_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_chrdev_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_chrdev_state_lock);

	mce_chrdev_open_count--;
	mce_chrdev_open_exclu = 0;

	spin_unlock(&mce_chrdev_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		/*
		 * When ERST is disabled, mce_chrdev_read() should return
		 * "no record" instead of "no device."
		 */
		if (rc == -ENODEV)
			return 0;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * In fact, we should have cleared the record after that has
	 * been flushed to the disk or sent to network in
	 * /sbin/mcelog, but we have no interface to support that now,
	 * so just clear it to avoid duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	*ubuf += sizeof(struct mce);

	return 0;
}
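
/*
 * /dev/mcelog read protocol, as implemented below: the reader (typically
 * mcelog(8)) must start at offset 0 and supply a buffer of at least
 * MCE_LOG_LEN * sizeof(struct mce); partial reads are rejected with -EINVAL.
 */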
static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
				size_t usize, loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_chrdev_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		if (err || buf != ubuf)
			goto out;
	}

	next = rcu_dereference_check_mce(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			struct mce *m = &mcelog.entry[i];

			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(m, 0, sizeof(*m));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, m, sizeof(*m));
			buf += sizeof(*m);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
			err |= copy_to_user(buf, m, sizeof(*m));
			smp_rmb();
			buf += sizeof(*m);
			memset(m, 0, sizeof(*m));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_chrdev_read_mutex);
	kfree(cpu_tsc);

	return err ? err : buf - ubuf;
}

static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_chrdev_wait, wait);
	if (READ_ONCE(mcelog.next))
		return POLLIN | POLLRDNORM;
	if (!mce_apei_read_done && apei_check_mce())
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
				unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
			    size_t usize, loff_t *off);

void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
			     const char __user *ubuf,
			     size_t usize, loff_t *off))
{
	mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);

static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
				size_t usize, loff_t *off)
{
	if (mce_write)
		return mce_write(filp, ubuf, usize, off);
	else
		return -EINVAL;
}

static const struct file_operations mce_chrdev_ops = {
	.open			= mce_chrdev_open,
	.release		= mce_chrdev_release,
	.read			= mce_chrdev_read,
	.write			= mce_chrdev_write,
	.poll			= mce_chrdev_poll,
	.unlocked_ioctl		= mce_chrdev_ioctl,
	.llseek			= no_llseek,
};

static struct miscdevice mce_chrdev_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

static void __mce_disable_bank(void *arg)
{
	int bank = *((int *)arg);

	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
	cmci_disable_bank(bank);
}

void mce_disable_bank(int bank)
{
	if (bank >= mca_cfg.banks) {
		pr_warn(FW_BUG
			"Ignoring request to disable invalid MCA bank %d.\n",
			bank);
		return;
	}
	set_bit(bank, mce_banks_ce_disabled);
	on_each_cpu(__mce_disable_bank, &bank, 1);
}

/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=no_lmce Disables LMCE
 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 * mce=bios_cmci_threshold Don't program the CMCI threshold
 */
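/*
 * Example command lines: each keyword goes in its own "mce=" instance,
 * e.g. "mce=no_cmci" or "mce=bootlog"; only the numeric form combines
 * values, e.g. "mce=2,500000" sets tolerant=2 and monarch_timeout to
 * 500000 usecs.
 */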
static int __init mcheck_enable(char *str)
{
	struct mca_config *cfg = &mca_cfg;

	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		cfg->disabled = true;
	else if (!strcmp(str, "no_cmci"))
		cfg->cmci_disabled = true;
	else if (!strcmp(str, "no_lmce"))
		cfg->lmce_disabled = true;
	else if (!strcmp(str, "dont_log_ce"))
		cfg->dont_log_ce = true;
	else if (!strcmp(str, "ignore_ce"))
		cfg->ignore_ce = true;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		cfg->bootlog = (str[0] == 'b');
	else if (!strcmp(str, "bios_cmci_threshold"))
		cfg->bios_cmci_threshold = true;
	else if (isdigit(str[0])) {
		if (get_option(&str, &cfg->tolerant) == 2)
			get_option(&str, &(cfg->monarch_timeout));
	} else {
		pr_info("mce argument %s ignored. Please use /sys\n", str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

int __init mcheck_init(void)
{
	mcheck_intel_therm_init();
	mcheck_vendor_init_severity();

	return 0;
}

/*
 * mce_syscore: PM support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable_error_reporting(void)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_syscore_suspend(void)
{
	return mce_disable_error_reporting();
}

static void mce_syscore_shutdown(void)
{
	mce_disable_error_reporting();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
}

static struct syscore_ops mce_syscore_ops = {
	.suspend	= mce_syscore_suspend,
	.shutdown	= mce_syscore_shutdown,
	.resume		= mce_syscore_resume,
};

/*
 * mce_device: Sysfs support
 */

static void mce_cpu_restart(void *data)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	mce_timer_delete_all();
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}

static struct bus_type mce_subsys = {
	.name		= "machinecheck",
	.dev_name	= "machinecheck",
};
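
/*
 * Each online CPU gets a device on this bus, so the attributes registered
 * below show up under /sys/devices/system/machinecheck/machinecheck<cpu>/.
 */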

DEFINE_PER_CPU(struct device *, mce_device);

void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct device *s, struct device_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct device *s, struct device_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct device *s,
			     struct device_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	if (mca_cfg.ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			mce_timer_delete_all();
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.ignore_ce = true;
		} else {
			/* enable ce features */
			mca_cfg.ignore_ce = false;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct device *s,
				 struct device_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	if (mca_cfg.cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.cmci_disabled = true;
		} else {
			/* enable cmci */
			mca_cfg.cmci_disabled = false;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = device_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}
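
/*
 * Per CPU tunables exported via sysfs: trigger, tolerant, monarch_timeout,
 * dont_log_ce, check_interval, ignore_ce and cmci_disabled, plus one
 * writable bank<N> control per MCA bank (set up in mce_init_banks()).
 */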
static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
	&mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
	&mca_cfg.cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
	&dev_attr_trigger,
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
	kfree(dev);
}

/* Per cpu device init. All of the cpus still share the same ctrl bank: */
static int mce_device_create(unsigned int cpu)
{
	struct device *dev;
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	dev = kzalloc(sizeof *dev, GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->id  = cpu;
	dev->bus = &mce_subsys;
	dev->release = &mce_device_release;

	err = device_register(dev);
	if (err) {
		put_device(dev);
		return err;
	}

	for (i = 0; mce_device_attrs[i]; i++) {
		err = device_create_file(dev, mce_device_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < mca_cfg.banks; j++) {
		err = device_create_file(dev, &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = dev;

	return 0;
error2:
	while (--j >= 0)
		device_remove_file(dev, &mce_banks[j].attr);
error:
	while (--i >= 0)
		device_remove_file(dev, mce_device_attrs[i]);

	device_unregister(dev);

	return err;
}

static void mce_device_remove(unsigned int cpu)
{
	struct device *dev = per_cpu(mce_device, cpu);
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_device_attrs[i]; i++)
		device_remove_file(dev, mce_device_attrs[i]);

	for (i = 0; i < mca_cfg.banks; i++)
		device_remove_file(dev, &mce_banks[i].attr);

	device_unregister(dev);
	cpumask_clear_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = NULL;
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		mce_device_create(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_device_remove(cpu);
		mce_intel_hcpu_update(cpu);

		/* intentionally ignoring frozen here */
		if (!(action & CPU_TASKS_FROZEN))
			cmci_rediscover();
		break;
	case CPU_DOWN_PREPARE:
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		del_timer_sync(t);
		break;
	case CPU_DOWN_FAILED:
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		mce_start_timer(cpu, t);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < mca_cfg.banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct device_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}
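
/*
 * The resulting bank<N> files hold the raw MCi_CTL value for that bank;
 * writing one updates mce_banks[N].ctl and triggers mce_restart(), which
 * reprograms the bank control MSRs on every CPU (see set_bank() above).
 */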

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data)) {
		err = -EIO;
		goto err_out;
	}

	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
		err = -ENOMEM;
		goto err_out;
	}

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		goto err_out_mem;

	cpu_notifier_register_begin();
	for_each_online_cpu(i) {
		err = mce_device_create(i);
		if (err) {
			/*
			 * Register notifier anyway (and do not unreg it) so
			 * that we don't leave undeleted timers, see notifier
			 * callback above.
			 */
			__register_hotcpu_notifier(&mce_cpu_notifier);
			cpu_notifier_register_done();
			goto err_device_create;
		}
	}

	__register_hotcpu_notifier(&mce_cpu_notifier);
	cpu_notifier_register_done();

	register_syscore_ops(&mce_syscore_ops);

	/* register character device /dev/mcelog */
	err = misc_register(&mce_chrdev_device);
	if (err)
		goto err_register;

	return 0;

err_register:
	unregister_syscore_ops(&mce_syscore_ops);

err_device_create:
	/*
	 * We didn't keep track of which devices were created above, but
	 * even if we had, the set of online cpus might have changed.
	 * Play safe and remove for every possible cpu, since
	 * mce_device_remove() will do the right thing.
	 */
	for_each_possible_cpu(i)
		mce_device_remove(i);

err_out_mem:
	free_cpumask_var(mce_device_initialized);

err_out:
	pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);

	return err;
}
device_initcall_sync(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mca_cfg.disabled = true;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_panicked, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif