|
@@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/*
+ * Hand an uncorrected memory error over to memory_failure().
+ *
+ * Logs the failing address taken from @m and then calls memory_failure()
+ * on the page frame that contains m->addr.  MF_ACTION_REQUIRED is always
+ * passed; MF_MUST_KILL is added when MCG_STATUS_RIPV is clear in
+ * m->mcgstatus.
+ *
+ * Returns the memory_failure() result unchanged.  Any non-zero value is
+ * logged as "not recovered"; the caller in do_machine_check() sends
+ * SIGBUS to the current task in that case.
+ */
+static int do_memory_failure(struct mce *m)
+{
+	int flags = MF_ACTION_REQUIRED;
+	int ret;
+
+	/* Terminate log messages with a newline so each is emitted as a complete record. */
+	pr_err("Uncorrected hardware memory error in user-access at %llx\n", m->addr);
+	if (!(m->mcgstatus & MCG_STATUS_RIPV))
+		flags |= MF_MUST_KILL;
+	/* memory_failure() takes a page frame number, not a byte address. */
+	ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
+	if (ret)
+		pr_err("Memory error not recovered\n");
+	return ret;
+}
|
|
|
+
|
|
|
/*
|
|
|
* The actual machine check handler. This only handles real
|
|
|
* exceptions when something got corrupted coming in through int 18.
|
|
@@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|
|
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
|
|
|
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
|
|
|
char *msg = "Unknown";
|
|
|
- u64 recover_paddr = ~0ull;
|
|
|
- int flags = MF_ACTION_REQUIRED;
|
|
|
int lmce = 0;
|
|
|
|
|
|
/* If this CPU is offline, just bail out. */
|
|
@@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * At insane "tolerant" levels we take no action. Otherwise
|
|
|
- * we only die if we have no other choice. For less serious
|
|
|
- * issues we try to recover, or limit damage to the current
|
|
|
- * process.
|
|
|
+ * If tolerant is at an insane level we drop requests to kill
|
|
|
+ * processes and continue even when there is no way out.
|
|
|
*/
|
|
|
- if (cfg->tolerant < 3) {
|
|
|
- if (no_way_out)
|
|
|
- mce_panic("Fatal machine check on current CPU", &m, msg);
|
|
|
- if (worst == MCE_AR_SEVERITY) {
|
|
|
- recover_paddr = m.addr;
|
|
|
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
|
|
|
- flags |= MF_MUST_KILL;
|
|
|
- } else if (kill_it) {
|
|
|
- force_sig(SIGBUS, current);
|
|
|
- }
|
|
|
- }
|
|
|
+ if (cfg->tolerant == 3)
|
|
|
+ kill_it = 0;
|
|
|
+ else if (no_way_out)
|
|
|
+ mce_panic("Fatal machine check on current CPU", &m, msg);
|
|
|
|
|
|
if (worst > 0)
|
|
|
mce_report_event(regs);
|
|
@@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|
|
out:
|
|
|
sync_core();
|
|
|
|
|
|
- if (recover_paddr == ~0ull)
|
|
|
- goto done;
|
|
|
+ if (worst != MCE_AR_SEVERITY && !kill_it)
|
|
|
+ goto out_ist;
|
|
|
|
|
|
- pr_err("Uncorrected hardware memory error in user-access at %llx",
|
|
|
- recover_paddr);
|
|
|
- /*
|
|
|
- * We must call memory_failure() here even if the current process is
|
|
|
- * doomed. We still need to mark the page as poisoned and alert any
|
|
|
- * other users of the page.
|
|
|
- */
|
|
|
- ist_begin_non_atomic(regs);
|
|
|
- local_irq_enable();
|
|
|
- if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
|
|
|
- pr_err("Memory error not recovered");
|
|
|
- force_sig(SIGBUS, current);
|
|
|
+ /* Fault was in user mode and we need to take some action */
|
|
|
+ if ((m.cs & 3) == 3) {
|
|
|
+ ist_begin_non_atomic(regs);
|
|
|
+ local_irq_enable();
|
|
|
+
|
|
|
+ if (kill_it || do_memory_failure(&m))
|
|
|
+ force_sig(SIGBUS, current);
|
|
|
+ local_irq_disable();
|
|
|
+ ist_end_non_atomic();
|
|
|
+ } else {
|
|
|
+ if (!fixup_exception(regs, X86_TRAP_MC))
|
|
|
+ mce_panic("Failed kernel mode recovery", &m, NULL);
|
|
|
}
|
|
|
- local_irq_disable();
|
|
|
- ist_end_non_atomic();
|
|
|
-done:
|
|
|
+
|
|
|
+out_ist:
|
|
|
ist_exit(regs);
|
|
|
}
|
|
|
EXPORT_SYMBOL_GPL(do_machine_check);
|