Browse Source

PCI/AER: Add sysfs attributes to provide AER stats and breakdown

Add sysfs attributes to provide total and breakdown of the AERs seen,
into different type of correctable, fatal and nonfatal errors:

  /sys/bus/pci/devices/<dev>/aer_dev_correctable
  /sys/bus/pci/devices/<dev>/aer_dev_fatal
  /sys/bus/pci/devices/<dev>/aer_dev_nonfatal

Signed-off-by: Rajat Jain <rajatja@google.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Rajat Jain 7 years ago
parent
commit
81aa5206f9

+ 94 - 0
Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats

@@ -0,0 +1,94 @@
+==========================
+PCIe Device AER statistics
+==========================
+These attributes show up under all the devices that are AER capable. These
+statistical counters indicate the errors "as seen/reported by the device".
+Note that this may mean that if an endpoint is causing problems, the AER
+counters may increment at its link partner (e.g. root port) because the
+errors may be "seen" / reported by the link partner and not the
+problematic endpoint itself (which may report all counters as 0 as it never
+saw any problems).
+
+Where:		/sys/bus/pci/devices/<dev>/aer_dev_correctable
+Date:		July 2018
+Kernel Version: 4.19.0
+Contact:	linux-pci@vger.kernel.org, rajatja@google.com
+Description:	List of correctable errors seen and reported by this
+		PCI device using ERR_COR. Note that since multiple errors may
+		be reported using a single ERR_COR message, thus
+		TOTAL_ERR_COR at the end of the file may not match the actual
+		total of all the errors in the file. Sample output:
+-------------------------------------------------------------------------
+localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_correctable
+Receiver Error 2
+Bad TLP 0
+Bad DLLP 0
+RELAY_NUM Rollover 0
+Replay Timer Timeout 0
+Advisory Non-Fatal 0
+Corrected Internal Error 0
+Header Log Overflow 0
+TOTAL_ERR_COR 2
+-------------------------------------------------------------------------
+
+Where:		/sys/bus/pci/devices/<dev>/aer_dev_fatal
+Date:		July 2018
+Kernel Version: 4.19.0
+Contact:	linux-pci@vger.kernel.org, rajatja@google.com
+Description:	List of uncorrectable fatal errors seen and reported by this
+		PCI device using ERR_FATAL. Note that since multiple errors may
+		be reported using a single ERR_FATAL message, thus
+		TOTAL_ERR_FATAL at the end of the file may not match the actual
+		total of all the errors in the file. Sample output:
+-------------------------------------------------------------------------
+localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_fatal
+Undefined 0
+Data Link Protocol 0
+Surprise Down Error 0
+Poisoned TLP 0
+Flow Control Protocol 0
+Completion Timeout 0
+Completer Abort 0
+Unexpected Completion 0
+Receiver Overflow 0
+Malformed TLP 0
+ECRC 0
+Unsupported Request 0
+ACS Violation 0
+Uncorrectable Internal Error 0
+MC Blocked TLP 0
+AtomicOp Egress Blocked 0
+TLP Prefix Blocked Error 0
+TOTAL_ERR_FATAL 0
+-------------------------------------------------------------------------
+
+Where:		/sys/bus/pci/devices/<dev>/aer_dev_nonfatal
+Date:		July 2018
+Kernel Version: 4.19.0
+Contact:	linux-pci@vger.kernel.org, rajatja@google.com
+Description:	List of uncorrectable nonfatal errors seen and reported by this
+		PCI device using ERR_NONFATAL. Note that since multiple errors
+		may be reported using a single ERR_FATAL message, thus
+		TOTAL_ERR_NONFATAL at the end of the file may not match the
+		actual total of all the errors in the file. Sample output:
+-------------------------------------------------------------------------
+localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_nonfatal
+Undefined 0
+Data Link Protocol 0
+Surprise Down Error 0
+Poisoned TLP 0
+Flow Control Protocol 0
+Completion Timeout 0
+Completer Abort 0
+Unexpected Completion 0
+Receiver Overflow 0
+Malformed TLP 0
+ECRC 0
+Unsupported Request 0
+ACS Violation 0
+Uncorrectable Internal Error 0
+MC Blocked TLP 0
+AtomicOp Egress Blocked 0
+TLP Prefix Blocked Error 0
+TOTAL_ERR_NONFATAL 0
+-------------------------------------------------------------------------

+ 5 - 0
Documentation/PCI/pcieaer-howto.txt

@@ -73,6 +73,11 @@ In the example, 'Requester ID' means the ID of the device who sends
 the error message to root port. Pls. refer to pci express specs for
 the error message to root port. Pls. refer to pci express specs for
 other fields.
 other fields.
 
 
+2.4 AER Statistics / Counters
+
+When PCIe AER errors are captured, the counters / statistics are also exposed
+in the form of sysfs attributes which are documented at
+Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
 
 
 3. Developer Guide
 3. Developer Guide
 
 

+ 3 - 0
drivers/pci/pci-sysfs.c

@@ -1746,6 +1746,9 @@ static const struct attribute_group *pci_dev_attr_groups[] = {
 #endif
 #endif
 	&pci_bridge_attr_group,
 	&pci_bridge_attr_group,
 	&pcie_dev_attr_group,
 	&pcie_dev_attr_group,
+#ifdef CONFIG_PCIEAER
+	&aer_stats_attr_group,
+#endif
 	NULL,
 	NULL,
 };
 };
 
 

+ 1 - 0
drivers/pci/pci.h

@@ -484,6 +484,7 @@ static inline int devm_of_pci_get_host_bridge_resources(struct device *dev,
 void pci_no_aer(void);
 void pci_no_aer(void);
 void pci_aer_init(struct pci_dev *dev);
 void pci_aer_init(struct pci_dev *dev);
 void pci_aer_exit(struct pci_dev *dev);
 void pci_aer_exit(struct pci_dev *dev);
+extern const struct attribute_group aer_stats_attr_group;
 #else
 #else
 static inline void pci_no_aer(void) { }
 static inline void pci_no_aer(void) { }
 static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; }
 static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; }

+ 94 - 0
drivers/pci/pcie/aer.c

@@ -542,6 +542,99 @@ static const char *aer_agent_string[] = {
 	"Transmitter ID"
 	"Transmitter ID"
 };
 };
 
 
+#define aer_stats_dev_attr(name, stats_array, strings_array,		\
+			   total_string, total_field)			\
+	static ssize_t							\
+	name##_show(struct device *dev, struct device_attribute *attr,	\
+		     char *buf)						\
+{									\
+	unsigned int i;							\
+	char *str = buf;						\
+	struct pci_dev *pdev = to_pci_dev(dev);				\
+	u64 *stats = pdev->aer_stats->stats_array;			\
+									\
+	for (i = 0; i < ARRAY_SIZE(strings_array); i++) {		\
+		if (strings_array[i])					\
+			str += sprintf(str, "%s %llu\n",		\
+				       strings_array[i], stats[i]);	\
+		else if (stats[i])					\
+			str += sprintf(str, #stats_array "_bit[%d] %llu\n",\
+				       i, stats[i]);			\
+	}								\
+	str += sprintf(str, "TOTAL_%s %llu\n", total_string,		\
+		       pdev->aer_stats->total_field);			\
+	return str-buf;							\
+}									\
+static DEVICE_ATTR_RO(name)
+
+aer_stats_dev_attr(aer_dev_correctable, dev_cor_errs,
+		   aer_correctable_error_string, "ERR_COR",
+		   dev_total_cor_errs);
+aer_stats_dev_attr(aer_dev_fatal, dev_fatal_errs,
+		   aer_uncorrectable_error_string, "ERR_FATAL",
+		   dev_total_fatal_errs);
+aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs,
+		   aer_uncorrectable_error_string, "ERR_NONFATAL",
+		   dev_total_nonfatal_errs);
+
+static struct attribute *aer_stats_attrs[] __ro_after_init = {
+	&dev_attr_aer_dev_correctable.attr,
+	&dev_attr_aer_dev_fatal.attr,
+	&dev_attr_aer_dev_nonfatal.attr,
+	NULL
+};
+
+static umode_t aer_stats_attrs_are_visible(struct kobject *kobj,
+					   struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (!pdev->aer_stats)
+		return 0;
+
+	return a->mode;
+}
+
+const struct attribute_group aer_stats_attr_group = {
+	.attrs  = aer_stats_attrs,
+	.is_visible = aer_stats_attrs_are_visible,
+};
+
+static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
+				   struct aer_err_info *info)
+{
+	int status, i, max = -1;
+	u64 *counter = NULL;
+	struct aer_stats *aer_stats = pdev->aer_stats;
+
+	if (!aer_stats)
+		return;
+
+	switch (info->severity) {
+	case AER_CORRECTABLE:
+		aer_stats->dev_total_cor_errs++;
+		counter = &aer_stats->dev_cor_errs[0];
+		max = AER_MAX_TYPEOF_COR_ERRS;
+		break;
+	case AER_NONFATAL:
+		aer_stats->dev_total_nonfatal_errs++;
+		counter = &aer_stats->dev_nonfatal_errs[0];
+		max = AER_MAX_TYPEOF_UNCOR_ERRS;
+		break;
+	case AER_FATAL:
+		aer_stats->dev_total_fatal_errs++;
+		counter = &aer_stats->dev_fatal_errs[0];
+		max = AER_MAX_TYPEOF_UNCOR_ERRS;
+		break;
+	}
+
+	status = (info->status & ~info->mask);
+	for (i = 0; i < max; i++)
+		if (status & (1 << i))
+			counter[i]++;
+}
+
 static void __print_tlp_header(struct pci_dev *dev,
 static void __print_tlp_header(struct pci_dev *dev,
 			       struct aer_header_log_regs *t)
 			       struct aer_header_log_regs *t)
 {
 {
@@ -574,6 +667,7 @@ static void __aer_print_error(struct pci_dev *dev,
 			pci_err(dev, "   [%2d] Unknown Error Bit%s\n",
 			pci_err(dev, "   [%2d] Unknown Error Bit%s\n",
 				i, info->first_error == i ? " (First)" : "");
 				i, info->first_error == i ? " (First)" : "");
 	}
 	}
+	pci_dev_aer_stats_incr(dev, info);
 }
 }
 
 
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)