|
@@ -0,0 +1,389 @@
|
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
|
+/*
|
|
|
+ * This file implements the error recovery as a core part of PCIe error
|
|
|
+ * reporting. When a PCIe error is delivered, an error message will be
|
|
|
+ * collected and printed to console, then, an error recovery procedure
|
|
|
+ * will be executed by following the PCI error recovery rules.
|
|
|
+ *
|
|
|
+ * Copyright (C) 2006 Intel Corp.
|
|
|
+ * Tom Long Nguyen (tom.l.nguyen@intel.com)
|
|
|
+ * Zhang Yanmin (yanmin.zhang@intel.com)
|
|
|
+ */
|
|
|
+
|
|
|
+#include <linux/pci.h>
|
|
|
+#include <linux/module.h>
|
|
|
+#include <linux/pci.h>
|
|
|
+#include <linux/kernel.h>
|
|
|
+#include <linux/errno.h>
|
|
|
+#include <linux/aer.h>
|
|
|
+#include "portdrv.h"
|
|
|
+#include "../pci.h"
|
|
|
+
|
|
|
+struct aer_broadcast_data {
|
|
|
+ enum pci_channel_state state;
|
|
|
+ enum pci_ers_result result;
|
|
|
+};
|
|
|
+
|
|
|
+static pci_ers_result_t merge_result(enum pci_ers_result orig,
|
|
|
+ enum pci_ers_result new)
|
|
|
+{
|
|
|
+ if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
|
|
|
+ return PCI_ERS_RESULT_NO_AER_DRIVER;
|
|
|
+
|
|
|
+ if (new == PCI_ERS_RESULT_NONE)
|
|
|
+ return orig;
|
|
|
+
|
|
|
+ switch (orig) {
|
|
|
+ case PCI_ERS_RESULT_CAN_RECOVER:
|
|
|
+ case PCI_ERS_RESULT_RECOVERED:
|
|
|
+ orig = new;
|
|
|
+ break;
|
|
|
+ case PCI_ERS_RESULT_DISCONNECT:
|
|
|
+ if (new == PCI_ERS_RESULT_NEED_RESET)
|
|
|
+ orig = PCI_ERS_RESULT_NEED_RESET;
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ return orig;
|
|
|
+}
|
|
|
+
|
|
|
+static int report_error_detected(struct pci_dev *dev, void *data)
|
|
|
+{
|
|
|
+ pci_ers_result_t vote;
|
|
|
+ const struct pci_error_handlers *err_handler;
|
|
|
+ struct aer_broadcast_data *result_data;
|
|
|
+
|
|
|
+ result_data = (struct aer_broadcast_data *) data;
|
|
|
+
|
|
|
+ device_lock(&dev->dev);
|
|
|
+ dev->error_state = result_data->state;
|
|
|
+
|
|
|
+ if (!dev->driver ||
|
|
|
+ !dev->driver->err_handler ||
|
|
|
+ !dev->driver->err_handler->error_detected) {
|
|
|
+ if (result_data->state == pci_channel_io_frozen &&
|
|
|
+ dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
|
|
|
+ /*
|
|
|
+ * In case of fatal recovery, if one of down-
|
|
|
+ * stream device has no driver. We might be
|
|
|
+ * unable to recover because a later insmod
|
|
|
+ * of a driver for this device is unaware of
|
|
|
+ * its hw state.
|
|
|
+ */
|
|
|
+ pci_printk(KERN_DEBUG, dev, "device has %s\n",
|
|
|
+ dev->driver ?
|
|
|
+ "no AER-aware driver" : "no driver");
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * If there's any device in the subtree that does not
|
|
|
+ * have an error_detected callback, returning
|
|
|
+ * PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
|
|
|
+ * the subsequent mmio_enabled/slot_reset/resume
|
|
|
+ * callbacks of "any" device in the subtree. All the
|
|
|
+ * devices in the subtree are left in the error state
|
|
|
+ * without recovery.
|
|
|
+ */
|
|
|
+
|
|
|
+ if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
|
|
|
+ vote = PCI_ERS_RESULT_NO_AER_DRIVER;
|
|
|
+ else
|
|
|
+ vote = PCI_ERS_RESULT_NONE;
|
|
|
+ } else {
|
|
|
+ err_handler = dev->driver->err_handler;
|
|
|
+ vote = err_handler->error_detected(dev, result_data->state);
|
|
|
+ pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
|
|
|
+ }
|
|
|
+
|
|
|
+ result_data->result = merge_result(result_data->result, vote);
|
|
|
+ device_unlock(&dev->dev);
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+static int report_mmio_enabled(struct pci_dev *dev, void *data)
|
|
|
+{
|
|
|
+ pci_ers_result_t vote;
|
|
|
+ const struct pci_error_handlers *err_handler;
|
|
|
+ struct aer_broadcast_data *result_data;
|
|
|
+
|
|
|
+ result_data = (struct aer_broadcast_data *) data;
|
|
|
+
|
|
|
+ device_lock(&dev->dev);
|
|
|
+ if (!dev->driver ||
|
|
|
+ !dev->driver->err_handler ||
|
|
|
+ !dev->driver->err_handler->mmio_enabled)
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ err_handler = dev->driver->err_handler;
|
|
|
+ vote = err_handler->mmio_enabled(dev);
|
|
|
+ result_data->result = merge_result(result_data->result, vote);
|
|
|
+out:
|
|
|
+ device_unlock(&dev->dev);
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+static int report_slot_reset(struct pci_dev *dev, void *data)
|
|
|
+{
|
|
|
+ pci_ers_result_t vote;
|
|
|
+ const struct pci_error_handlers *err_handler;
|
|
|
+ struct aer_broadcast_data *result_data;
|
|
|
+
|
|
|
+ result_data = (struct aer_broadcast_data *) data;
|
|
|
+
|
|
|
+ device_lock(&dev->dev);
|
|
|
+ if (!dev->driver ||
|
|
|
+ !dev->driver->err_handler ||
|
|
|
+ !dev->driver->err_handler->slot_reset)
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ err_handler = dev->driver->err_handler;
|
|
|
+ vote = err_handler->slot_reset(dev);
|
|
|
+ result_data->result = merge_result(result_data->result, vote);
|
|
|
+out:
|
|
|
+ device_unlock(&dev->dev);
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+static int report_resume(struct pci_dev *dev, void *data)
|
|
|
+{
|
|
|
+ const struct pci_error_handlers *err_handler;
|
|
|
+
|
|
|
+ device_lock(&dev->dev);
|
|
|
+ dev->error_state = pci_channel_io_normal;
|
|
|
+
|
|
|
+ if (!dev->driver ||
|
|
|
+ !dev->driver->err_handler ||
|
|
|
+ !dev->driver->err_handler->resume)
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ err_handler = dev->driver->err_handler;
|
|
|
+ err_handler->resume(dev);
|
|
|
+ pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
|
|
|
+out:
|
|
|
+ device_unlock(&dev->dev);
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+/**
|
|
|
+ * default_reset_link - default reset function
|
|
|
+ * @dev: pointer to pci_dev data structure
|
|
|
+ *
|
|
|
+ * Invoked when performing link reset on a Downstream Port or a
|
|
|
+ * Root Port with no aer driver.
|
|
|
+ */
|
|
|
+static pci_ers_result_t default_reset_link(struct pci_dev *dev)
|
|
|
+{
|
|
|
+ pci_reset_bridge_secondary_bus(dev);
|
|
|
+ pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n");
|
|
|
+ return PCI_ERS_RESULT_RECOVERED;
|
|
|
+}
|
|
|
+
|
|
|
+static pci_ers_result_t reset_link(struct pci_dev *dev)
|
|
|
+{
|
|
|
+ struct pci_dev *udev;
|
|
|
+ pci_ers_result_t status;
|
|
|
+ struct pcie_port_service_driver *driver = NULL;
|
|
|
+
|
|
|
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
|
|
|
+ /* Reset this port for all subordinates */
|
|
|
+ udev = dev;
|
|
|
+ } else {
|
|
|
+ /* Reset the upstream component (likely downstream port) */
|
|
|
+ udev = dev->bus->self;
|
|
|
+ }
|
|
|
+
|
|
|
+#if IS_ENABLED(CONFIG_PCIEAER)
|
|
|
+ /* Use the aer driver of the component firstly */
|
|
|
+ driver = find_aer_service(udev);
|
|
|
+#endif
|
|
|
+
|
|
|
+ if (driver && driver->reset_link) {
|
|
|
+ status = driver->reset_link(udev);
|
|
|
+ } else if (udev->has_secondary_link) {
|
|
|
+ status = default_reset_link(udev);
|
|
|
+ } else {
|
|
|
+ pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
|
|
|
+ pci_name(udev));
|
|
|
+ return PCI_ERS_RESULT_DISCONNECT;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (status != PCI_ERS_RESULT_RECOVERED) {
|
|
|
+ pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n",
|
|
|
+ pci_name(udev));
|
|
|
+ return PCI_ERS_RESULT_DISCONNECT;
|
|
|
+ }
|
|
|
+
|
|
|
+ return status;
|
|
|
+}
|
|
|
+
|
|
|
+/**
|
|
|
+ * broadcast_error_message - handle message broadcast to downstream drivers
|
|
|
+ * @dev: pointer to from where in a hierarchy message is broadcasted down
|
|
|
+ * @state: error state
|
|
|
+ * @error_mesg: message to print
|
|
|
+ * @cb: callback to be broadcasted
|
|
|
+ *
|
|
|
+ * Invoked during error recovery process. Once being invoked, the content
|
|
|
+ * of error severity will be broadcasted to all downstream drivers in a
|
|
|
+ * hierarchy in question.
|
|
|
+ */
|
|
|
+static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
|
|
|
+ enum pci_channel_state state,
|
|
|
+ char *error_mesg,
|
|
|
+ int (*cb)(struct pci_dev *, void *))
|
|
|
+{
|
|
|
+ struct aer_broadcast_data result_data;
|
|
|
+
|
|
|
+ pci_printk(KERN_DEBUG, dev, "broadcast %s message\n", error_mesg);
|
|
|
+ result_data.state = state;
|
|
|
+ if (cb == report_error_detected)
|
|
|
+ result_data.result = PCI_ERS_RESULT_CAN_RECOVER;
|
|
|
+ else
|
|
|
+ result_data.result = PCI_ERS_RESULT_RECOVERED;
|
|
|
+
|
|
|
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
|
|
|
+ /*
|
|
|
+ * If the error is reported by a bridge, we think this error
|
|
|
+ * is related to the downstream link of the bridge, so we
|
|
|
+ * do error recovery on all subordinates of the bridge instead
|
|
|
+ * of the bridge and clear the error status of the bridge.
|
|
|
+ */
|
|
|
+ if (cb == report_error_detected)
|
|
|
+ dev->error_state = state;
|
|
|
+ pci_walk_bus(dev->subordinate, cb, &result_data);
|
|
|
+ if (cb == report_resume) {
|
|
|
+ pci_cleanup_aer_uncorrect_error_status(dev);
|
|
|
+ dev->error_state = pci_channel_io_normal;
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ * If the error is reported by an end point, we think this
|
|
|
+ * error is related to the upstream link of the end point.
|
|
|
+ */
|
|
|
+ if (state == pci_channel_io_normal)
|
|
|
+ /*
|
|
|
+ * the error is non fatal so the bus is ok, just invoke
|
|
|
+ * the callback for the function that logged the error.
|
|
|
+ */
|
|
|
+ cb(dev, &result_data);
|
|
|
+ else
|
|
|
+ pci_walk_bus(dev->bus, cb, &result_data);
|
|
|
+ }
|
|
|
+
|
|
|
+ return result_data.result;
|
|
|
+}
|
|
|
+
|
|
|
+/**
|
|
|
+ * pcie_do_fatal_recovery - handle fatal error recovery process
|
|
|
+ * @dev: pointer to a pci_dev data structure of agent detecting an error
|
|
|
+ *
|
|
|
+ * Invoked when an error is fatal. Once being invoked, removes the devices
|
|
|
+ * beneath this AER agent, followed by reset link e.g. secondary bus reset
|
|
|
+ * followed by re-enumeration of devices.
|
|
|
+ */
|
|
|
+void pcie_do_fatal_recovery(struct pci_dev *dev)
|
|
|
+{
|
|
|
+ struct pci_dev *udev;
|
|
|
+ struct pci_bus *parent;
|
|
|
+ struct pci_dev *pdev, *temp;
|
|
|
+ pci_ers_result_t result;
|
|
|
+
|
|
|
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
|
|
|
+ udev = dev;
|
|
|
+ else
|
|
|
+ udev = dev->bus->self;
|
|
|
+
|
|
|
+ parent = udev->subordinate;
|
|
|
+ pci_lock_rescan_remove();
|
|
|
+ list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
|
|
|
+ bus_list) {
|
|
|
+ pci_dev_get(pdev);
|
|
|
+ pci_dev_set_disconnected(pdev, NULL);
|
|
|
+ if (pci_has_subordinate(pdev))
|
|
|
+ pci_walk_bus(pdev->subordinate,
|
|
|
+ pci_dev_set_disconnected, NULL);
|
|
|
+ pci_stop_and_remove_bus_device(pdev);
|
|
|
+ pci_dev_put(pdev);
|
|
|
+ }
|
|
|
+
|
|
|
+ result = reset_link(udev);
|
|
|
+
|
|
|
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
|
|
|
+ /*
|
|
|
+ * If the error is reported by a bridge, we think this error
|
|
|
+ * is related to the downstream link of the bridge, so we
|
|
|
+ * do error recovery on all subordinates of the bridge instead
|
|
|
+ * of the bridge and clear the error status of the bridge.
|
|
|
+ */
|
|
|
+ pci_cleanup_aer_uncorrect_error_status(dev);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (result == PCI_ERS_RESULT_RECOVERED) {
|
|
|
+ if (pcie_wait_for_link(udev, true))
|
|
|
+ pci_rescan_bus(udev->bus);
|
|
|
+ pci_info(dev, "Device recovery from fatal error successful\n");
|
|
|
+ } else {
|
|
|
+ pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
|
|
|
+ pci_info(dev, "Device recovery from fatal error failed\n");
|
|
|
+ }
|
|
|
+
|
|
|
+ pci_unlock_rescan_remove();
|
|
|
+}
|
|
|
+
|
|
|
+/**
|
|
|
+ * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
|
|
|
+ * @dev: pointer to a pci_dev data structure of agent detecting an error
|
|
|
+ *
|
|
|
+ * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
|
|
|
+ * error detected message to all downstream drivers within a hierarchy in
|
|
|
+ * question and return the returned code.
|
|
|
+ */
|
|
|
+void pcie_do_nonfatal_recovery(struct pci_dev *dev)
|
|
|
+{
|
|
|
+ pci_ers_result_t status;
|
|
|
+ enum pci_channel_state state;
|
|
|
+
|
|
|
+ state = pci_channel_io_normal;
|
|
|
+
|
|
|
+ status = broadcast_error_message(dev,
|
|
|
+ state,
|
|
|
+ "error_detected",
|
|
|
+ report_error_detected);
|
|
|
+
|
|
|
+ if (status == PCI_ERS_RESULT_CAN_RECOVER)
|
|
|
+ status = broadcast_error_message(dev,
|
|
|
+ state,
|
|
|
+ "mmio_enabled",
|
|
|
+ report_mmio_enabled);
|
|
|
+
|
|
|
+ if (status == PCI_ERS_RESULT_NEED_RESET) {
|
|
|
+ /*
|
|
|
+ * TODO: Should call platform-specific
|
|
|
+ * functions to reset slot before calling
|
|
|
+ * drivers' slot_reset callbacks?
|
|
|
+ */
|
|
|
+ status = broadcast_error_message(dev,
|
|
|
+ state,
|
|
|
+ "slot_reset",
|
|
|
+ report_slot_reset);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (status != PCI_ERS_RESULT_RECOVERED)
|
|
|
+ goto failed;
|
|
|
+
|
|
|
+ broadcast_error_message(dev,
|
|
|
+ state,
|
|
|
+ "resume",
|
|
|
+ report_resume);
|
|
|
+
|
|
|
+ pci_info(dev, "AER: Device recovery successful\n");
|
|
|
+ return;
|
|
|
+
|
|
|
+failed:
|
|
|
+ pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
|
|
|
+
|
|
|
+ /* TODO: Should kernel panic here? */
|
|
|
+ pci_info(dev, "AER: Device recovery failed\n");
|
|
|
+}
|