Browse Source

Merge branch 'x86/mce' into x86/ras

Pursue a single RAS/MCE topic branch on x86.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar 12 years ago
parent
commit
0237d7f355

+ 5 - 0
Documentation/x86/x86_64/boot-options.txt

@@ -176,6 +176,11 @@ ACPI
 
 
   acpi=noirq	Don't route interrupts
   acpi=noirq	Don't route interrupts
 
 
+  acpi=nocmcff	Disable firmware-first mode for corrected errors. This
+		disables parsing the HEST CMC error source to check if
+		firmware has set the firmware-first (FF) flag. This may
+		result in duplicate corrected error reports.
+
 PCI
 PCI
 
 
   pci=off		Don't use PCI
   pci=off		Don't use PCI

+ 2 - 0
arch/x86/include/asm/acpi.h

@@ -86,6 +86,7 @@ extern int acpi_pci_disabled;
 extern int acpi_skip_timer_override;
 extern int acpi_skip_timer_override;
 extern int acpi_use_timer_override;
 extern int acpi_use_timer_override;
 extern int acpi_fix_pin2_polarity;
 extern int acpi_fix_pin2_polarity;
+extern int acpi_disable_cmcff;
 
 
 extern u8 acpi_sci_flags;
 extern u8 acpi_sci_flags;
 extern int acpi_sci_override_gsi;
 extern int acpi_sci_override_gsi;
@@ -168,6 +169,7 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
 
 
 #define acpi_lapic 0
 #define acpi_lapic 0
 #define acpi_ioapic 0
 #define acpi_ioapic 0
+#define acpi_disable_cmcff 0
 static inline void acpi_noirq_set(void) { }
 static inline void acpi_noirq_set(void) { }
 static inline void acpi_disable_pci(void) { }
 static inline void acpi_disable_pci(void) { }
 static inline void disable_acpi(void) { }
 static inline void disable_acpi(void) { }

+ 3 - 0
arch/x86/include/asm/mce.h

@@ -188,6 +188,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
 				    const char __user *ubuf,
 				    const char __user *ubuf,
 				    size_t usize, loff_t *off));
 				    size_t usize, loff_t *off));
 
 
+/* Disable CMCI/polling for MCA bank claimed by firmware */
+extern void mce_disable_bank(int bank);
+
 /*
 /*
  * Exception handler
  * Exception handler
  */
  */

+ 5 - 0
arch/x86/kernel/acpi/boot.c

@@ -67,6 +67,7 @@ EXPORT_SYMBOL(acpi_pci_disabled);
 int acpi_lapic;
 int acpi_lapic;
 int acpi_ioapic;
 int acpi_ioapic;
 int acpi_strict;
 int acpi_strict;
+int acpi_disable_cmcff;
 
 
 u8 acpi_sci_flags __initdata;
 u8 acpi_sci_flags __initdata;
 int acpi_sci_override_gsi __initdata;
 int acpi_sci_override_gsi __initdata;
@@ -1626,6 +1627,10 @@ static int __init parse_acpi(char *arg)
 	/* "acpi=copy_dsdt" copys DSDT */
 	/* "acpi=copy_dsdt" copys DSDT */
 	else if (strcmp(arg, "copy_dsdt") == 0) {
 	else if (strcmp(arg, "copy_dsdt") == 0) {
 		acpi_gbl_copy_dsdt_locally = 1;
 		acpi_gbl_copy_dsdt_locally = 1;
+	}
+	/* "acpi=nocmcff" disables FF mode for corrected errors */
+	else if (strcmp(arg, "nocmcff") == 0) {
+		acpi_disable_cmcff = 1;
 	} else {
 	} else {
 		/* Core will printk when we return error. */
 		/* Core will printk when we return error. */
 		return -EINVAL;
 		return -EINVAL;

+ 3 - 0
arch/x86/kernel/cpu/mcheck/mce-internal.h

@@ -25,15 +25,18 @@ int mce_severity(struct mce *a, int tolerant, char **msg);
 struct dentry *mce_get_debugfs_dir(void);
 struct dentry *mce_get_debugfs_dir(void);
 
 
 extern struct mce_bank *mce_banks;
 extern struct mce_bank *mce_banks;
+extern mce_banks_t mce_banks_ce_disabled;
 
 
 #ifdef CONFIG_X86_MCE_INTEL
 #ifdef CONFIG_X86_MCE_INTEL
 unsigned long mce_intel_adjust_timer(unsigned long interval);
 unsigned long mce_intel_adjust_timer(unsigned long interval);
 void mce_intel_cmci_poll(void);
 void mce_intel_cmci_poll(void);
 void mce_intel_hcpu_update(unsigned long cpu);
 void mce_intel_hcpu_update(unsigned long cpu);
+void cmci_disable_bank(int bank);
 #else
 #else
 # define mce_intel_adjust_timer mce_adjust_timer_default
 # define mce_intel_adjust_timer mce_adjust_timer_default
 static inline void mce_intel_cmci_poll(void) { }
 static inline void mce_intel_cmci_poll(void) { }
 static inline void mce_intel_hcpu_update(unsigned long cpu) { }
 static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void cmci_disable_bank(int bank) { }
 #endif
 #endif
 
 
 void mce_timer_kick(unsigned long interval);
 void mce_timer_kick(unsigned long interval);

+ 28 - 0
arch/x86/kernel/cpu/mcheck/mce.c

@@ -97,6 +97,15 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 };
 };
 
 
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global bitmap of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still reported through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
 static DEFINE_PER_CPU(struct work_struct, mce_work);
 static DEFINE_PER_CPU(struct work_struct, mce_work);
 
 
 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
@@ -1935,6 +1944,25 @@ static struct miscdevice mce_chrdev_device = {
 	&mce_chrdev_ops,
 	&mce_chrdev_ops,
 };
 };
 
 
+static void __mce_disable_bank(void *arg)
+{
+	int bank = *((int *)arg);
+	__clear_bit(bank, __get_cpu_var(mce_poll_banks));
+	cmci_disable_bank(bank);
+}
+
+void mce_disable_bank(int bank)
+{
+	if (bank >= mca_cfg.banks) {
+		pr_warn(FW_BUG
+			"Ignoring request to disable invalid MCA bank %d.\n",
+			bank);
+		return;
+	}
+	set_bit(bank, mce_banks_ce_disabled);
+	on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
 /*
 /*
  * mce=off Disables machine check
  * mce=off Disables machine check
  * mce=no_cmci Disables CMCI
  * mce=no_cmci Disables CMCI

+ 32 - 10
arch/x86/kernel/cpu/mcheck/mce_intel.c

@@ -203,6 +203,10 @@ static void cmci_discover(int banks)
 		if (test_bit(i, owned))
 		if (test_bit(i, owned))
 			continue;
 			continue;
 
 
+		/* Skip banks in firmware first mode */
+		if (test_bit(i, mce_banks_ce_disabled))
+			continue;
+
 		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
 		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
 
 
 		/* Already owned by someone else? */
 		/* Already owned by someone else? */
@@ -271,6 +275,19 @@ void cmci_recheck(void)
 	local_irq_restore(flags);
 	local_irq_restore(flags);
 }
 }
 
 
+/* Caller must hold cmci_discover_lock */
+static void __cmci_disable_bank(int bank)
+{
+	u64 val;
+
+	if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
+		return;
+	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+	val &= ~MCI_CTL2_CMCI_EN;
+	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+	__clear_bit(bank, __get_cpu_var(mce_banks_owned));
+}
+
 /*
 /*
  * Disable CMCI on this CPU for all banks it owns when it goes down.
  * Disable CMCI on this CPU for all banks it owns when it goes down.
  * This allows other CPUs to claim the banks on rediscovery.
  * This allows other CPUs to claim the banks on rediscovery.
@@ -280,20 +297,12 @@ void cmci_clear(void)
 	unsigned long flags;
 	unsigned long flags;
 	int i;
 	int i;
 	int banks;
 	int banks;
-	u64 val;
 
 
 	if (!cmci_supported(&banks))
 	if (!cmci_supported(&banks))
 		return;
 		return;
 	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
 	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
-	for (i = 0; i < banks; i++) {
-		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
-			continue;
-		/* Disable CMCI */
-		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-		val &= ~MCI_CTL2_CMCI_EN;
-		wrmsrl(MSR_IA32_MCx_CTL2(i), val);
-		__clear_bit(i, __get_cpu_var(mce_banks_owned));
-	}
+	for (i = 0; i < banks; i++)
+		__cmci_disable_bank(i);
 	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
 	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
 }
 }
 
 
@@ -327,6 +336,19 @@ void cmci_reenable(void)
 		cmci_discover(banks);
 		cmci_discover(banks);
 }
 }
 
 
+void cmci_disable_bank(int bank)
+{
+	int banks;
+	unsigned long flags;
+
+	if (!cmci_supported(&banks))
+		return;
+
+	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+	__cmci_disable_bank(bank);
+	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
 static void intel_init_cmci(void)
 static void intel_init_cmci(void)
 {
 {
 	int banks;
 	int banks;

+ 29 - 9
drivers/acpi/apei/ghes.c

@@ -409,6 +409,34 @@ static void ghes_clear_estatus(struct ghes *ghes)
 	ghes->flags &= ~GHES_TO_CLEAR;
 	ghes->flags &= ~GHES_TO_CLEAR;
 }
 }
 
 
+static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev)
+{
+#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
+	unsigned long pfn;
+	int sec_sev = ghes_severity(gdata->error_severity);
+	struct cper_sec_mem_err *mem_err;
+	mem_err = (struct cper_sec_mem_err *)(gdata + 1);
+
+	if (sec_sev == GHES_SEV_CORRECTED &&
+	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) &&
+	    (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)) {
+		pfn = mem_err->physical_addr >> PAGE_SHIFT;
+		if (pfn_valid(pfn))
+			memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE);
+		else if (printk_ratelimit())
+			pr_warn(FW_WARN GHES_PFX
+			"Invalid address in generic error data: %#llx\n",
+			mem_err->physical_addr);
+	}
+	if (sev == GHES_SEV_RECOVERABLE &&
+	    sec_sev == GHES_SEV_RECOVERABLE &&
+	    mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
+		pfn = mem_err->physical_addr >> PAGE_SHIFT;
+		memory_failure_queue(pfn, 0, 0);
+	}
+#endif
+}
+
 static void ghes_do_proc(struct ghes *ghes,
 static void ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 			 const struct acpi_hest_generic_status *estatus)
 {
 {
@@ -428,15 +456,7 @@ static void ghes_do_proc(struct ghes *ghes,
 			apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
 			apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
 						  mem_err);
 						  mem_err);
 #endif
 #endif
-#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
-			if (sev == GHES_SEV_RECOVERABLE &&
-			    sec_sev == GHES_SEV_RECOVERABLE &&
-			    mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
-				unsigned long pfn;
-				pfn = mem_err->physical_addr >> PAGE_SHIFT;
-				memory_failure_queue(pfn, 0, 0);
-			}
-#endif
+			ghes_handle_memory_failure(gdata, sev);
 		}
 		}
 #ifdef CONFIG_ACPI_APEI_PCIEAER
 #ifdef CONFIG_ACPI_APEI_PCIEAER
 		else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
 		else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,

+ 38 - 0
drivers/acpi/apei/hest.c

@@ -36,6 +36,7 @@
 #include <linux/io.h>
 #include <linux/io.h>
 #include <linux/platform_device.h>
 #include <linux/platform_device.h>
 #include <acpi/apei.h>
 #include <acpi/apei.h>
+#include <asm/mce.h>
 
 
 #include "apei-internal.h"
 #include "apei-internal.h"
 
 
@@ -121,6 +122,40 @@ int apei_hest_parse(apei_hest_func_t func, void *data)
 }
 }
 EXPORT_SYMBOL_GPL(apei_hest_parse);
 EXPORT_SYMBOL_GPL(apei_hest_parse);
 
 
+/*
+ * Check if firmware advertises firmware first mode. We need the FF bit to be
+ * set along with a set of MC banks which work in FF mode.
+ */
+static int __init hest_parse_cmc(struct acpi_hest_header *hest_hdr, void *data)
+{
+	int i;
+	struct acpi_hest_ia_corrected *cmc;
+	struct acpi_hest_ia_error_bank *mc_bank;
+
+	if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+		return 0;
+
+	cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
+	if (!cmc->enabled)
+		return 0;
+
+	/*
+	 * We expect HEST to provide a list of MC banks that report errors
+	 * in firmware first mode. Otherwise, return a non-zero value to
+	 * indicate that we are done parsing HEST.
+	 */
+	if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || !cmc->num_hardware_banks)
+		return 1;
+
+	pr_info(HEST_PFX "Enabling Firmware First mode for corrected errors.\n");
+
+	mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
+	for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+		mce_disable_bank(mc_bank->bank_number);
+
+	return 1;
+}
+
 struct ghes_arr {
 struct ghes_arr {
 	struct platform_device **ghes_devs;
 	struct platform_device **ghes_devs;
 	unsigned int count;
 	unsigned int count;
@@ -227,6 +262,9 @@ void __init acpi_hest_init(void)
 		goto err;
 		goto err;
 	}
 	}
 
 
+	if (!acpi_disable_cmcff)
+		apei_hest_parse(hest_parse_cmc, NULL);
+
 	if (!ghes_disable) {
 	if (!ghes_disable) {
 		rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count);
 		rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count);
 		if (rc)
 		if (rc)

+ 1 - 0
include/linux/mm.h

@@ -1798,6 +1798,7 @@ enum mf_flags {
 	MF_COUNT_INCREASED = 1 << 0,
 	MF_COUNT_INCREASED = 1 << 0,
 	MF_ACTION_REQUIRED = 1 << 1,
 	MF_ACTION_REQUIRED = 1 << 1,
 	MF_MUST_KILL = 1 << 2,
 	MF_MUST_KILL = 1 << 2,
+	MF_SOFT_OFFLINE = 1 << 3,
 };
 };
 extern int memory_failure(unsigned long pfn, int trapno, int flags);
 extern int memory_failure(unsigned long pfn, int trapno, int flags);
 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);

+ 4 - 1
mm/memory-failure.c

@@ -1286,7 +1286,10 @@ static void memory_failure_work_func(struct work_struct *work)
 		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
 		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
 		if (!gotten)
 		if (!gotten)
 			break;
 			break;
-		memory_failure(entry.pfn, entry.trapno, entry.flags);
+		if (entry.flags & MF_SOFT_OFFLINE)
+			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+		else
+			memory_failure(entry.pfn, entry.trapno, entry.flags);
 	}
 	}
 }
 }