|
@@ -1047,6 +1047,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|
|
char *msg = "Unknown";
|
|
|
u64 recover_paddr = ~0ull;
|
|
|
int flags = MF_ACTION_REQUIRED;
|
|
|
+ int lmce = 0;
|
|
|
|
|
|
prev_state = ist_enter(regs);
|
|
|
|
|
@@ -1074,11 +1075,20 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|
|
kill_it = 1;
|
|
|
|
|
|
/*
|
|
|
- * Go through all the banks in exclusion of the other CPUs.
|
|
|
- * This way we don't report duplicated events on shared banks
|
|
|
- * because the first one to see it will clear it.
|
|
|
+ * Check if this MCE is signaled to only this logical processor
|
|
|
*/
|
|
|
- order = mce_start(&no_way_out);
|
|
|
+ if (m.mcgstatus & MCG_STATUS_LMCES)
|
|
|
+ lmce = 1;
|
|
|
+ else {
|
|
|
+ /*
|
|
|
+ * Go through all the banks in exclusion of the other CPUs.
|
|
|
+ * This way we don't report duplicated events on shared banks
|
|
|
+ * because the first one to see it will clear it.
|
|
|
+ * If this is a Local MCE, then no need to perform rendezvous.
|
|
|
+ */
|
|
|
+ order = mce_start(&no_way_out);
|
|
|
+ }
|
|
|
+
|
|
|
for (i = 0; i < cfg->banks; i++) {
|
|
|
__clear_bit(i, toclear);
|
|
|
if (!test_bit(i, valid_banks))
|
|
@@ -1155,8 +1165,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|
|
* Do most of the synchronization with other CPUs.
|
|
|
* When there's any problem use only local no_way_out state.
|
|
|
*/
|
|
|
- if (mce_end(order) < 0)
|
|
|
- no_way_out = worst >= MCE_PANIC_SEVERITY;
|
|
|
+ if (!lmce) {
|
|
|
+ if (mce_end(order) < 0)
|
|
|
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ * Local MCE skipped calling mce_reign()
|
|
|
+ * If we found a fatal error, we need to panic here.
|
|
|
+ */
|
|
|
+ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
|
|
|
+ mce_panic("Machine check from unknown source",
|
|
|
+ NULL, NULL);
|
|
|
+ }
|
|
|
|
|
|
/*
|
|
|
* At insane "tolerant" levels we take no action. Otherwise
|