Browse Source

Merge tag 'ras_for_3.14' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp into x86/ras

Pull RAS updates from Borislav Petkov:

  * Add the ability to override error reporting agents: some machines
  sport a new extended error logging capability which, if implemented
  properly in the BIOS, makes the corresponding EDAC module redundant.
  From Gong Chen.

  * PCIe AER tracepoint severity levels fix, from Rui Wang.

  * Error path correction for the mce device init, from Levente Kurusa.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar 11 years ago
parent
commit
014952270e

+ 8 - 0
Documentation/kernel-parameters.txt

@@ -881,6 +881,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
 
 			The xen output can only be used by Xen PV guests.
 			The xen output can only be used by Xen PV guests.
 
 
+	edac_report=	[HW,EDAC] Control how EDAC events are reported
+			Format: {"on" | "off" | "force"}
+			on: enable EDAC to report H/W events. May be overridden
+			by another, higher-priority error reporting module.
+			off: disable H/W event reporting through EDAC.
+			force: enforce the use of EDAC to report H/W events.
+			default: on.
+
 	ekgdboc=	[X86,KGDB] Allow early kernel console debugging
 	ekgdboc=	[X86,KGDB] Allow early kernel console debugging
 			ekgdboc=kbd
 			ekgdboc=kbd
 
 

+ 3 - 1
arch/x86/kernel/cpu/mcheck/mce.c

@@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu)
 	dev->release = &mce_device_release;
 	dev->release = &mce_device_release;
 
 
 	err = device_register(dev);
 	err = device_register(dev);
-	if (err)
+	if (err) {
+		put_device(dev);
 		return err;
 		return err;
+	}
 
 
 	for (i = 0; mce_device_attrs[i]; i++) {
 	for (i = 0; mce_device_attrs[i]; i++) {
 		err = device_create_file(dev, mce_device_attrs[i]);
 		err = device_create_file(dev, mce_device_attrs[i]);

+ 16 - 2
drivers/acpi/acpi_extlog.c

@@ -12,6 +12,7 @@
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_bus.h>
 #include <linux/cper.h>
 #include <linux/cper.h>
 #include <linux/ratelimit.h>
 #include <linux/ratelimit.h>
+#include <linux/edac.h>
 #include <asm/cpu.h>
 #include <asm/cpu.h>
 #include <asm/mce.h>
 #include <asm/mce.h>
 
 
@@ -43,6 +44,8 @@ struct extlog_l1_head {
 	u8  rev1[12];
 	u8  rev1[12];
 };
 };
 
 
+static int old_edac_report_status;
+
 static u8 extlog_dsm_uuid[] = "663E35AF-CC10-41A4-88EA-5470AF055295";
 static u8 extlog_dsm_uuid[] = "663E35AF-CC10-41A4-88EA-5470AF055295";
 
 
 /* L1 table related physical address */
 /* L1 table related physical address */
@@ -150,7 +153,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
 
 
 	rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
 	rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
 
 
-	return NOTIFY_DONE;
+	return NOTIFY_STOP;
 }
 }
 
 
 static int extlog_get_dsm(acpi_handle handle, int rev, int func, u64 *ret)
 static int extlog_get_dsm(acpi_handle handle, int rev, int func, u64 *ret)
@@ -231,8 +234,12 @@ static int __init extlog_init(void)
 	u64 cap;
 	u64 cap;
 	int rc;
 	int rc;
 
 
-	rc = -ENODEV;
+	if (get_edac_report_status() == EDAC_REPORTING_FORCE) {
+		pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n");
+		return -EPERM;
+	}
 
 
+	rc = -ENODEV;
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	if (!(cap & MCG_ELOG_P))
 	if (!(cap & MCG_ELOG_P))
 		return rc;
 		return rc;
@@ -287,6 +294,12 @@ static int __init extlog_init(void)
 	if (elog_buf == NULL)
 	if (elog_buf == NULL)
 		goto err_release_elog;
 		goto err_release_elog;
 
 
+	/*
+	 * The eMCA event reporting method takes priority over the EDAC
+	 * method, unless EDAC event reporting has been force-enabled.
+	 */
+	old_edac_report_status = get_edac_report_status();
+	set_edac_report_status(EDAC_REPORTING_DISABLED);
 	mce_register_decode_chain(&extlog_mce_dec);
 	mce_register_decode_chain(&extlog_mce_dec);
 	/* enable OS to be involved to take over management from BIOS */
 	/* enable OS to be involved to take over management from BIOS */
 	((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
 	((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
@@ -308,6 +321,7 @@ err:
 
 
 static void __exit extlog_exit(void)
 static void __exit extlog_exit(void)
 {
 {
+	set_edac_report_status(old_edac_report_status);
 	mce_unregister_decode_chain(&extlog_mce_dec);
 	mce_unregister_decode_chain(&extlog_mce_dec);
 	((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN;
 	((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN;
 	if (extlog_l1_addr)
 	if (extlog_l1_addr)

+ 19 - 0
drivers/edac/edac_stub.c

@@ -29,6 +29,25 @@ EXPORT_SYMBOL_GPL(edac_err_assert);
 
 
 static atomic_t edac_subsys_valid = ATOMIC_INIT(0);
 static atomic_t edac_subsys_valid = ATOMIC_INIT(0);
 
 
+int edac_report_status = EDAC_REPORTING_ENABLED;
+EXPORT_SYMBOL_GPL(edac_report_status);
+
+static int __init edac_report_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	if (!strncmp(str, "on", 2))
+		set_edac_report_status(EDAC_REPORTING_ENABLED);
+	else if (!strncmp(str, "off", 3))
+		set_edac_report_status(EDAC_REPORTING_DISABLED);
+	else if (!strncmp(str, "force", 5))
+		set_edac_report_status(EDAC_REPORTING_FORCE);
+
+	return 0;
+}
+__setup("edac_report=", edac_report_setup);
+
 /*
 /*
  * called to determine if there is an EDAC driver interested in
  * called to determine if there is an EDAC driver interested in
  * knowing an event (such as NMI) occurred
  * knowing an event (such as NMI) occurred

+ 5 - 1
drivers/edac/sb_edac.c

@@ -1829,6 +1829,9 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
 	struct mem_ctl_info *mci;
 	struct mem_ctl_info *mci;
 	struct sbridge_pvt *pvt;
 	struct sbridge_pvt *pvt;
 
 
+	if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+		return NOTIFY_DONE;
+
 	mci = get_mci_for_node_id(mce->socketid);
 	mci = get_mci_for_node_id(mce->socketid);
 	if (!mci)
 	if (!mci)
 		return NOTIFY_BAD;
 		return NOTIFY_BAD;
@@ -2142,9 +2145,10 @@ static int __init sbridge_init(void)
 	opstate_init();
 	opstate_init();
 
 
 	pci_rc = pci_register_driver(&sbridge_driver);
 	pci_rc = pci_register_driver(&sbridge_driver);
-
 	if (pci_rc >= 0) {
 	if (pci_rc >= 0) {
 		mce_register_decode_chain(&sbridge_mce_dec);
 		mce_register_decode_chain(&sbridge_mce_dec);
+		if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+			sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n");
 		return 0;
 		return 0;
 	}
 	}
 
 

+ 28 - 0
include/linux/edac.h

@@ -35,6 +35,34 @@ extern void edac_atomic_assert_error(void);
 extern struct bus_type *edac_get_sysfs_subsys(void);
 extern struct bus_type *edac_get_sysfs_subsys(void);
 extern void edac_put_sysfs_subsys(void);
 extern void edac_put_sysfs_subsys(void);
 
 
+enum {
+	EDAC_REPORTING_ENABLED,
+	EDAC_REPORTING_DISABLED,
+	EDAC_REPORTING_FORCE
+};
+
+extern int edac_report_status;
+#ifdef CONFIG_EDAC
+static inline int get_edac_report_status(void)
+{
+	return edac_report_status;
+}
+
+static inline void set_edac_report_status(int new)
+{
+	edac_report_status = new;
+}
+#else
+static inline int get_edac_report_status(void)
+{
+	return EDAC_REPORTING_DISABLED;
+}
+
+static inline void set_edac_report_status(int new)
+{
+}
+#endif
+
 static inline void opstate_init(void)
 static inline void opstate_init(void)
 {
 {
 	switch (edac_op_state) {
 	switch (edac_op_state) {

+ 5 - 5
include/trace/events/ras.h

@@ -5,7 +5,7 @@
 #define _TRACE_AER_H
 #define _TRACE_AER_H
 
 
 #include <linux/tracepoint.h>
 #include <linux/tracepoint.h>
-#include <linux/edac.h>
+#include <linux/aer.h>
 
 
 
 
 /*
 /*
@@ -63,10 +63,10 @@ TRACE_EVENT(aer_event,
 
 
 	TP_printk("%s PCIe Bus Error: severity=%s, %s\n",
 	TP_printk("%s PCIe Bus Error: severity=%s, %s\n",
 		__get_str(dev_name),
 		__get_str(dev_name),
-		__entry->severity == HW_EVENT_ERR_CORRECTED ? "Corrected" :
-			__entry->severity == HW_EVENT_ERR_FATAL ?
-			"Fatal" : "Uncorrected",
-		__entry->severity == HW_EVENT_ERR_CORRECTED ?
+		__entry->severity == AER_CORRECTABLE ? "Corrected" :
+			__entry->severity == AER_FATAL ?
+			"Fatal" : "Uncorrected, non-fatal",
+		__entry->severity == AER_CORRECTABLE ?
 		__print_flags(__entry->status, "|", aer_correctable_errors) :
 		__print_flags(__entry->status, "|", aer_correctable_errors) :
 		__print_flags(__entry->status, "|", aer_uncorrectable_errors))
 		__print_flags(__entry->status, "|", aer_uncorrectable_errors))
 );
 );