Browse Source

EDAC: Expose per-DIMM error counts in sysfs

The old csrowX sysfs directories have per-csrow error counters, but the
new dimmX directories do not currently expose error counts.

EDAC already keeps these counts; add them to sysfs so that per-DIMM counts
are still available when CONFIG_EDAC_LEGACY_SYSFS=n.

Signed-off-by: Aaron Miller <aaronmiller@fb.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20161103220153.3997328-1-aaronmiller@fb.com
Signed-off-by: Borislav Petkov <bp@suse.de>
Aaron Miller 8 years ago
parent
commit
4fb6fde74d

+ 17 - 0
Documentation/ABI/testing/sysfs-devices-edac

@@ -138,3 +138,20 @@ Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
 Description:	This attribute file will display what type of memory is
 Description:	This attribute file will display what type of memory is
 		currently on this csrow. Normally, either buffered or
 		currently on this csrow. Normally, either buffered or
 		unbuffered memory (for example, Unbuffered-DDR3).
 		unbuffered memory (for example, Unbuffered-DDR3).
+
+What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ce_count
+Date:		October 2016
+Contact:	linux-edac@vger.kernel.org
+Description:	This attribute file displays the total count of correctable
+		errors that have occurred on this DIMM. This count is very important
+		to examine. CEs provide early indications that a DIMM is beginning
+		to fail. This count field should be monitored for non-zero values,
+		which should be reported to the system administrator.
+
+What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ue_count
+Date:		October 2016
+Contact:	linux-edac@vger.kernel.org
+Description:	This attribute file displays the total count of uncorrectable
+		errors that have occurred on this DIMM. If panic_on_ue is set, this
+		counter will not have a chance to increment, since EDAC will panic the
+		system.

+ 20 - 0
Documentation/admin-guide/ras.rst

@@ -438,11 +438,13 @@ A typical EDAC system has the following structure under
 	│   │   ├── ce_count
 	│   │   ├── ce_count
 	│   │   ├── ce_noinfo_count
 	│   │   ├── ce_noinfo_count
 	│   │   ├── dimm0
 	│   │   ├── dimm0
+	│   │   │   ├── dimm_ce_count
 	│   │   │   ├── dimm_dev_type
 	│   │   │   ├── dimm_dev_type
 	│   │   │   ├── dimm_edac_mode
 	│   │   │   ├── dimm_edac_mode
 	│   │   │   ├── dimm_label
 	│   │   │   ├── dimm_label
 	│   │   │   ├── dimm_location
 	│   │   │   ├── dimm_location
 	│   │   │   ├── dimm_mem_type
 	│   │   │   ├── dimm_mem_type
+	│   │   │   ├── dimm_ue_count
 	│   │   │   ├── size
 	│   │   │   ├── size
 	│   │   │   └── uevent
 	│   │   │   └── uevent
 	│   │   ├── max_location
 	│   │   ├── max_location
@@ -457,11 +459,13 @@ A typical EDAC system has the following structure under
 	│   │   ├── ce_count
 	│   │   ├── ce_count
 	│   │   ├── ce_noinfo_count
 	│   │   ├── ce_noinfo_count
 	│   │   ├── dimm0
 	│   │   ├── dimm0
+	│   │   │   ├── dimm_ce_count
 	│   │   │   ├── dimm_dev_type
 	│   │   │   ├── dimm_dev_type
 	│   │   │   ├── dimm_edac_mode
 	│   │   │   ├── dimm_edac_mode
 	│   │   │   ├── dimm_label
 	│   │   │   ├── dimm_label
 	│   │   │   ├── dimm_location
 	│   │   │   ├── dimm_location
 	│   │   │   ├── dimm_mem_type
 	│   │   │   ├── dimm_mem_type
+	│   │   │   ├── dimm_ue_count
 	│   │   │   ├── size
 	│   │   │   ├── size
 	│   │   │   └── uevent
 	│   │   │   └── uevent
 	│   │   ├── max_location
 	│   │   ├── max_location
@@ -483,6 +487,22 @@ this ``X`` memory module:
 	This attribute file displays, in count of megabytes, the memory
 	This attribute file displays, in count of megabytes, the memory
 	that this csrow contains.
 	that this csrow contains.
 
 
+- ``dimm_ue_count`` - Uncorrectable Errors count attribute file
+
+	This attribute file displays the total count of uncorrectable
+	errors that have occurred on this DIMM. If panic_on_ue is set,
+	this counter will not have a chance to increment, since EDAC
+	will panic the system.
+
+- ``dimm_ce_count`` - Correctable Errors count attribute file
+
+	This attribute file displays the total count of correctable
+	errors that have occurred on this DIMM. This count is very
+	important to examine. CEs provide early indications that a
+	DIMM is beginning to fail. This count field should be
+	monitored for non-zero values, which should be reported
+	to the system administrator.
+
 - ``dimm_dev_type``  - Device type attribute file
 - ``dimm_dev_type``  - Device type attribute file
 
 
 	This attribute file will display what type of DRAM device is
 	This attribute file will display what type of DRAM device is

+ 38 - 0
drivers/edac/edac_mc_sysfs.c

@@ -569,6 +569,40 @@ static ssize_t dimmdev_edac_mode_show(struct device *dev,
 	return sprintf(data, "%s\n", edac_caps[dimm->edac_mode]);
 	return sprintf(data, "%s\n", edac_caps[dimm->edac_mode]);
 }
 }
 
 
+/*
+ * Read handler for the per-DIMM "dimm_ce_count" sysfs attribute.
+ *
+ * The DIMM's location[] coordinates are turned into an offset with
+ * EDAC_DIMM_OFF(), and that offset selects the entry in the last
+ * (n_layers - 1) row of the owning controller's ce_per_layer table.
+ * The value is printed into @data as an unsigned decimal followed by
+ * a newline; the return value is whatever sprintf() reports.
+ */
+static ssize_t dimmdev_ce_count_show(struct device *dev,
+				      struct device_attribute *mattr,
+				      char *data)
+{
+	struct dimm_info *dimm = to_dimm(dev);
+	int idx = EDAC_DIMM_OFF(dimm->mci->layers, dimm->mci->n_layers,
+				dimm->location[0], dimm->location[1],
+				dimm->location[2]);
+	u32 ce = dimm->mci->ce_per_layer[dimm->mci->n_layers - 1][idx];
+
+	return sprintf(data, "%u\n", ce);
+}
+
+/*
+ * Read handler for the per-DIMM "dimm_ue_count" sysfs attribute.
+ *
+ * The DIMM's location[] coordinates are turned into an offset with
+ * EDAC_DIMM_OFF(), and that offset selects the entry in the last
+ * (n_layers - 1) row of the owning controller's ue_per_layer table.
+ * The value is printed into @data as an unsigned decimal followed by
+ * a newline; the return value is whatever sprintf() reports.
+ */
+static ssize_t dimmdev_ue_count_show(struct device *dev,
+				      struct device_attribute *mattr,
+				      char *data)
+{
+	struct dimm_info *dimm = to_dimm(dev);
+	int slot = EDAC_DIMM_OFF(dimm->mci->layers, dimm->mci->n_layers,
+				 dimm->location[0], dimm->location[1],
+				 dimm->location[2]);
+	u32 ue = dimm->mci->ue_per_layer[dimm->mci->n_layers - 1][slot];
+
+	return sprintf(data, "%u\n", ue);
+}
+
 /* dimm/rank attribute files */
 /* dimm/rank attribute files */
 static DEVICE_ATTR(dimm_label, S_IRUGO | S_IWUSR,
 static DEVICE_ATTR(dimm_label, S_IRUGO | S_IWUSR,
 		   dimmdev_label_show, dimmdev_label_store);
 		   dimmdev_label_show, dimmdev_label_store);
@@ -577,6 +611,8 @@ static DEVICE_ATTR(size, S_IRUGO, dimmdev_size_show, NULL);
 static DEVICE_ATTR(dimm_mem_type, S_IRUGO, dimmdev_mem_type_show, NULL);
 static DEVICE_ATTR(dimm_mem_type, S_IRUGO, dimmdev_mem_type_show, NULL);
 static DEVICE_ATTR(dimm_dev_type, S_IRUGO, dimmdev_dev_type_show, NULL);
 static DEVICE_ATTR(dimm_dev_type, S_IRUGO, dimmdev_dev_type_show, NULL);
 static DEVICE_ATTR(dimm_edac_mode, S_IRUGO, dimmdev_edac_mode_show, NULL);
 static DEVICE_ATTR(dimm_edac_mode, S_IRUGO, dimmdev_edac_mode_show, NULL);
+static DEVICE_ATTR(dimm_ce_count, S_IRUGO, dimmdev_ce_count_show, NULL);
+static DEVICE_ATTR(dimm_ue_count, S_IRUGO, dimmdev_ue_count_show, NULL);
 
 
 /* attributes of the dimm<id>/rank<id> object */
 /* attributes of the dimm<id>/rank<id> object */
 static struct attribute *dimm_attrs[] = {
 static struct attribute *dimm_attrs[] = {
@@ -586,6 +622,8 @@ static struct attribute *dimm_attrs[] = {
 	&dev_attr_dimm_mem_type.attr,
 	&dev_attr_dimm_mem_type.attr,
 	&dev_attr_dimm_dev_type.attr,
 	&dev_attr_dimm_dev_type.attr,
 	&dev_attr_dimm_edac_mode.attr,
 	&dev_attr_dimm_edac_mode.attr,
+	&dev_attr_dimm_ce_count.attr,
+	&dev_attr_dimm_ue_count.attr,
 	NULL,
 	NULL,
 };
 };