Browse Source

trace, RAS: Add eMCA trace event interface

Add trace interface to elaborate all H/W error related information.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Chen, Gong 11 years ago
parent
commit
2dfb7d51a6

+ 3 - 1
drivers/acpi/Kconfig

@@ -370,6 +370,7 @@ config ACPI_EXTLOG
 	tristate "Extended Error Log support"
 	tristate "Extended Error Log support"
 	depends on X86_MCE && X86_LOCAL_APIC
 	depends on X86_MCE && X86_LOCAL_APIC
 	select UEFI_CPER
 	select UEFI_CPER
+	select RAS
 	default n
 	default n
 	help
 	help
 	  Certain usages such as Predictive Failure Analysis (PFA) require
 	  Certain usages such as Predictive Failure Analysis (PFA) require
@@ -384,6 +385,7 @@ config ACPI_EXTLOG
 
 
 	  Enhanced MCA Logging allows firmware to provide additional error
 	  Enhanced MCA Logging allows firmware to provide additional error
 	  information to system software, synchronous with MCE or CMCI. This
 	  information to system software, synchronous with MCE or CMCI. This
-	  driver adds support for that functionality.
+	  driver adds support for that functionality with corresponding
+	  tracepoint which carries that information to userspace.
 
 
 endif	# ACPI
 endif	# ACPI

+ 24 - 3
drivers/acpi/acpi_extlog.c

@@ -16,6 +16,7 @@
 #include <asm/mce.h>
 #include <asm/mce.h>
 
 
 #include "apei/apei-internal.h"
 #include "apei/apei-internal.h"
+#include <ras/ras_event.h>
 
 
 #define EXT_ELOG_ENTRY_MASK	GENMASK_ULL(51, 0) /* elog entry address mask */
 #define EXT_ELOG_ENTRY_MASK	GENMASK_ULL(51, 0) /* elog entry address mask */
 
 
@@ -137,8 +138,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
 	struct mce *mce = (struct mce *)data;
 	struct mce *mce = (struct mce *)data;
 	int	bank = mce->bank;
 	int	bank = mce->bank;
 	int	cpu = mce->extcpu;
 	int	cpu = mce->extcpu;
-	struct acpi_generic_status *estatus;
-	int rc;
+	struct acpi_generic_status *estatus, *tmp;
+	struct acpi_generic_data *gdata;
+	const uuid_le *fru_id = &NULL_UUID_LE;
+	char *fru_text = "";
+	uuid_le *sec_type;
+	static u32 err_seq;
 
 
 	estatus = extlog_elog_entry_check(cpu, bank);
 	estatus = extlog_elog_entry_check(cpu, bank);
 	if (estatus == NULL)
 	if (estatus == NULL)
@@ -148,7 +153,23 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
 	/* clear record status to enable BIOS to update it again */
 	/* clear record status to enable BIOS to update it again */
 	estatus->block_status = 0;
 	estatus->block_status = 0;
 
 
-	rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
+	tmp = (struct acpi_generic_status *)elog_buf;
+	print_extlog_rcd(NULL, tmp, cpu);
+
+	/* log event via trace */
+	err_seq++;
+	gdata = (struct acpi_generic_data *)(tmp + 1);
+	if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
+		fru_id = (uuid_le *)gdata->fru_id;
+	if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
+		fru_text = gdata->fru_text;
+	sec_type = (uuid_le *)gdata->section_type;
+	if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
+		struct cper_sec_mem_err *mem = (void *)(gdata + 1);
+		if (gdata->error_data_length >= sizeof(*mem))
+			trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
+					       (u8)gdata->error_severity);
+	}
 
 
 	return NOTIFY_STOP;
 	return NOTIFY_STOP;
 }
 }

+ 41 - 4
drivers/firmware/efi/cper.c

@@ -207,7 +207,7 @@ const char *cper_mem_err_type_str(unsigned int etype)
 }
 }
 EXPORT_SYMBOL_GPL(cper_mem_err_type_str);
 EXPORT_SYMBOL_GPL(cper_mem_err_type_str);
 
 
-static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg)
+static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
 {
 {
 	u32 len, n;
 	u32 len, n;
 
 
@@ -249,7 +249,7 @@ static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg)
 	return n;
 	return n;
 }
 }
 
 
-static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg)
+static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg)
 {
 {
 	u32 len, n;
 	u32 len, n;
 	const char *bank = NULL, *device = NULL;
 	const char *bank = NULL, *device = NULL;
@@ -271,8 +271,44 @@ static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg)
 	return n;
 	return n;
 }
 }
 
 
+void cper_mem_err_pack(const struct cper_sec_mem_err *mem,
+		       struct cper_mem_err_compact *cmem)
+{
+	cmem->validation_bits = mem->validation_bits;
+	cmem->node = mem->node;
+	cmem->card = mem->card;
+	cmem->module = mem->module;
+	cmem->bank = mem->bank;
+	cmem->device = mem->device;
+	cmem->row = mem->row;
+	cmem->column = mem->column;
+	cmem->bit_pos = mem->bit_pos;
+	cmem->requestor_id = mem->requestor_id;
+	cmem->responder_id = mem->responder_id;
+	cmem->target_id = mem->target_id;
+	cmem->rank = mem->rank;
+	cmem->mem_array_handle = mem->mem_array_handle;
+	cmem->mem_dev_handle = mem->mem_dev_handle;
+}
+
+const char *cper_mem_err_unpack(struct trace_seq *p,
+				struct cper_mem_err_compact *cmem)
+{
+	const char *ret = p->buffer + p->len;
+
+	if (cper_mem_err_location(cmem, rcd_decode_str))
+		trace_seq_printf(p, "%s", rcd_decode_str);
+	if (cper_dimm_err_location(cmem, rcd_decode_str))
+		trace_seq_printf(p, "%s", rcd_decode_str);
+	trace_seq_putc(p, '\0');
+
+	return ret;
+}
+
 static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
 static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
 {
 {
+	struct cper_mem_err_compact cmem;
+
 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
 		printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
 		printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
 	if (mem->validation_bits & CPER_MEM_VALID_PA)
 	if (mem->validation_bits & CPER_MEM_VALID_PA)
@@ -281,14 +317,15 @@ static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
 	if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
 	if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
 		printk("%s""physical_address_mask: 0x%016llx\n",
 		printk("%s""physical_address_mask: 0x%016llx\n",
 		       pfx, mem->physical_addr_mask);
 		       pfx, mem->physical_addr_mask);
-	if (cper_mem_err_location(mem, rcd_decode_str))
+	cper_mem_err_pack(mem, &cmem);
+	if (cper_mem_err_location(&cmem, rcd_decode_str))
 		printk("%s%s\n", pfx, rcd_decode_str);
 		printk("%s%s\n", pfx, rcd_decode_str);
 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
 		u8 etype = mem->error_type;
 		u8 etype = mem->error_type;
 		printk("%s""error_type: %d, %s\n", pfx, etype,
 		printk("%s""error_type: %d, %s\n", pfx, etype,
 		       cper_mem_err_type_str(etype));
 		       cper_mem_err_type_str(etype));
 	}
 	}
-	if (cper_dimm_err_location(mem, rcd_decode_str))
+	if (cper_dimm_err_location(&cmem, rcd_decode_str))
 		printk("%s%s\n", pfx, rcd_decode_str);
 		printk("%s%s\n", pfx, rcd_decode_str);
 }
 }
 
 

+ 3 - 0
drivers/ras/ras.c

@@ -23,4 +23,7 @@ static int __init ras_init(void)
 }
 }
 subsys_initcall(ras_init);
 subsys_initcall(ras_init);
 
 
+#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE)
+EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
+#endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);

+ 23 - 0
include/linux/cper.h

@@ -22,6 +22,7 @@
 #define LINUX_CPER_H
 #define LINUX_CPER_H
 
 
 #include <linux/uuid.h>
 #include <linux/uuid.h>
+#include <linux/trace_seq.h>
 
 
 /* CPER record signature and the size */
 /* CPER record signature and the size */
 #define CPER_SIG_RECORD				"CPER"
 #define CPER_SIG_RECORD				"CPER"
@@ -363,6 +364,24 @@ struct cper_sec_mem_err {
 	__u16	mem_dev_handle;		/* module handle in UEFI 2.4 */
 	__u16	mem_dev_handle;		/* module handle in UEFI 2.4 */
 };
 };
 
 
+struct cper_mem_err_compact {
+	__u64	validation_bits;
+	__u16	node;
+	__u16	card;
+	__u16	module;
+	__u16	bank;
+	__u16	device;
+	__u16	row;
+	__u16	column;
+	__u16	bit_pos;
+	__u64	requestor_id;
+	__u64	responder_id;
+	__u64	target_id;
+	__u16	rank;
+	__u16	mem_array_handle;
+	__u16	mem_dev_handle;
+};
+
 struct cper_sec_pcie {
 struct cper_sec_pcie {
 	__u64		validation_bits;
 	__u64		validation_bits;
 	__u32		port_type;
 	__u32		port_type;
@@ -406,5 +425,9 @@ const char *cper_severity_str(unsigned int);
 const char *cper_mem_err_type_str(unsigned int);
 const char *cper_mem_err_type_str(unsigned int);
 void cper_print_bits(const char *prefix, unsigned int bits,
 void cper_print_bits(const char *prefix, unsigned int bits,
 		     const char * const strs[], unsigned int strs_size);
 		     const char * const strs[], unsigned int strs_size);
+void cper_mem_err_pack(const struct cper_sec_mem_err *,
+		       struct cper_mem_err_compact *);
+const char *cper_mem_err_unpack(struct trace_seq *,
+				struct cper_mem_err_compact *);
 
 
 #endif
 #endif

+ 64 - 0
include/ras/ras_event.h

@@ -9,6 +9,70 @@
 #include <linux/edac.h>
 #include <linux/edac.h>
 #include <linux/ktime.h>
 #include <linux/ktime.h>
 #include <linux/aer.h>
 #include <linux/aer.h>
+#include <linux/cper.h>
+
+/*
+ * MCE Extended Error Log trace event
+ *
+ * These events are generated when hardware detects a corrected or
+ * uncorrected event.
+ */
+
+/* memory trace event */
+
+#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE)
+TRACE_EVENT(extlog_mem_event,
+	TP_PROTO(struct cper_sec_mem_err *mem,
+		 u32 err_seq,
+		 const uuid_le *fru_id,
+		 const char *fru_text,
+		 u8 sev),
+
+	TP_ARGS(mem, err_seq, fru_id, fru_text, sev),
+
+	TP_STRUCT__entry(
+		__field(u32, err_seq)
+		__field(u8, etype)
+		__field(u8, sev)
+		__field(u64, pa)
+		__field(u8, pa_mask_lsb)
+		__field_struct(uuid_le, fru_id)
+		__string(fru_text, fru_text)
+		__field_struct(struct cper_mem_err_compact, data)
+	),
+
+	TP_fast_assign(
+		__entry->err_seq = err_seq;
+		if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
+			__entry->etype = mem->error_type;
+		else
+			__entry->etype = ~0;
+		__entry->sev = sev;
+		if (mem->validation_bits & CPER_MEM_VALID_PA)
+			__entry->pa = mem->physical_addr;
+		else
+			__entry->pa = ~0ull;
+
+		if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
+			__entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask);
+		else
+			__entry->pa_mask_lsb = ~0;
+		__entry->fru_id = *fru_id;
+		__assign_str(fru_text, fru_text);
+		cper_mem_err_pack(mem, &__entry->data);
+	),
+
+	TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s",
+		  __entry->err_seq,
+		  cper_severity_str(__entry->sev),
+		  cper_mem_err_type_str(__entry->etype),
+		  __entry->pa,
+		  __entry->pa_mask_lsb,
+		  cper_mem_err_unpack(p, &__entry->data),
+		  &__entry->fru_id,
+		  __get_str(fru_text))
+);
+#endif
 
 
 /*
 /*
  * Hardware Events Report
  * Hardware Events Report