|
@@ -1003,51 +1003,6 @@ static void mce_clear_state(unsigned long *toclear)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
- * Need to save faulting physical address associated with a process
|
|
|
- * in the machine check handler some place where we can grab it back
|
|
|
- * later in mce_notify_process()
|
|
|
- */
|
|
|
-#define MCE_INFO_MAX 16
|
|
|
-
|
|
|
-struct mce_info {
|
|
|
- atomic_t inuse;
|
|
|
- struct task_struct *t;
|
|
|
- __u64 paddr;
|
|
|
- int restartable;
|
|
|
-} mce_info[MCE_INFO_MAX];
|
|
|
-
|
|
|
-static void mce_save_info(__u64 addr, int c)
|
|
|
-{
|
|
|
- struct mce_info *mi;
|
|
|
-
|
|
|
- for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
|
|
|
- if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
|
|
|
- mi->t = current;
|
|
|
- mi->paddr = addr;
|
|
|
- mi->restartable = c;
|
|
|
- return;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- mce_panic("Too many concurrent recoverable errors", NULL, NULL);
|
|
|
-}
|
|
|
-
|
|
|
-static struct mce_info *mce_find_info(void)
|
|
|
-{
|
|
|
- struct mce_info *mi;
|
|
|
-
|
|
|
- for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
|
|
|
- if (atomic_read(&mi->inuse) && mi->t == current)
|
|
|
- return mi;
|
|
|
- return NULL;
|
|
|
-}
|
|
|
-
|
|
|
-static void mce_clear_info(struct mce_info *mi)
|
|
|
-{
|
|
|
- atomic_set(&mi->inuse, 0);
|
|
|
-}
|
|
|
-
|
|
|
/*
|
|
|
* The actual machine check handler. This only handles real
|
|
|
* exceptions when something got corrupted coming in through int 18.
|
|
@@ -1086,6 +1041,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|
|
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
|
|
|
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
|
|
|
char *msg = "Unknown";
|
|
|
+ u64 recover_paddr = ~0ull;
|
|
|
+ int flags = MF_ACTION_REQUIRED;
|
|
|
|
|
|
prev_state = ist_enter(regs);
|
|
|
|
|
@@ -1207,9 +1164,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|
|
if (no_way_out)
|
|
|
mce_panic("Fatal machine check on current CPU", &m, msg);
|
|
|
if (worst == MCE_AR_SEVERITY) {
|
|
|
- /* schedule action before return to userland */
|
|
|
- mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
|
|
|
- set_thread_flag(TIF_MCE_NOTIFY);
|
|
|
+ recover_paddr = m.addr;
|
|
|
+ if (!(m.mcgstatus & MCG_STATUS_RIPV))
|
|
|
+ flags |= MF_MUST_KILL;
|
|
|
} else if (kill_it) {
|
|
|
force_sig(SIGBUS, current);
|
|
|
}
|
|
@@ -1220,6 +1177,26 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|
|
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
|
|
|
out:
|
|
|
sync_core();
|
|
|
+
|
|
|
+ if (recover_paddr == ~0ull)
|
|
|
+ goto done;
|
|
|
+
|
|
|
+ pr_err("Uncorrected hardware memory error in user-access at %llx",
|
|
|
+ recover_paddr);
|
|
|
+ /*
|
|
|
+ * We must call memory_failure() here even if the current process is
|
|
|
+ * doomed. We still need to mark the page as poisoned and alert any
|
|
|
+ * other users of the page.
|
|
|
+ */
|
|
|
+ ist_begin_non_atomic(regs);
|
|
|
+ local_irq_enable();
|
|
|
+ if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
|
|
|
+ pr_err("Memory error not recovered");
|
|
|
+ force_sig(SIGBUS, current);
|
|
|
+ }
|
|
|
+ local_irq_disable();
|
|
|
+ ist_end_non_atomic();
|
|
|
+done:
|
|
|
ist_exit(regs, prev_state);
|
|
|
}
|
|
|
EXPORT_SYMBOL_GPL(do_machine_check);
|
|
@@ -1237,42 +1214,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
-/*
|
|
|
- * Called in process context that interrupted by MCE and marked with
|
|
|
- * TIF_MCE_NOTIFY, just before returning to erroneous userland.
|
|
|
- * This code is allowed to sleep.
|
|
|
- * Attempt possible recovery such as calling the high level VM handler to
|
|
|
- * process any corrupted pages, and kill/signal current process if required.
|
|
|
- * Action required errors are handled here.
|
|
|
- */
|
|
|
-void mce_notify_process(void)
|
|
|
-{
|
|
|
- unsigned long pfn;
|
|
|
- struct mce_info *mi = mce_find_info();
|
|
|
- int flags = MF_ACTION_REQUIRED;
|
|
|
-
|
|
|
- if (!mi)
|
|
|
- mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
|
|
|
- pfn = mi->paddr >> PAGE_SHIFT;
|
|
|
-
|
|
|
- clear_thread_flag(TIF_MCE_NOTIFY);
|
|
|
-
|
|
|
- pr_err("Uncorrected hardware memory error in user-access at %llx",
|
|
|
- mi->paddr);
|
|
|
- /*
|
|
|
- * We must call memory_failure() here even if the current process is
|
|
|
- * doomed. We still need to mark the page as poisoned and alert any
|
|
|
- * other users of the page.
|
|
|
- */
|
|
|
- if (!mi->restartable)
|
|
|
- flags |= MF_MUST_KILL;
|
|
|
- if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
|
|
|
- pr_err("Memory error not recovered");
|
|
|
- force_sig(SIGBUS, current);
|
|
|
- }
|
|
|
- mce_clear_info(mi);
|
|
|
-}
|
|
|
-
|
|
|
/*
|
|
|
* Action optional processing happens here (picking up
|
|
|
* from the list of faulting pages that do_machine_check()
|