err.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * This file implements the error recovery as a core part of PCIe error
  4. * reporting. When a PCIe error is delivered, an error message will be
  5. * collected and printed to console, then, an error recovery procedure
  6. * will be executed by following the PCI error recovery rules.
  7. *
  8. * Copyright (C) 2006 Intel Corp.
  9. * Tom Long Nguyen (tom.l.nguyen@intel.com)
  10. * Zhang Yanmin (yanmin.zhang@intel.com)
  11. */
  12. #include <linux/pci.h>
  13. #include <linux/module.h>
  14. #include <linux/pci.h>
  15. #include <linux/kernel.h>
  16. #include <linux/errno.h>
  17. #include <linux/aer.h>
  18. #include "portdrv.h"
  19. #include "../pci.h"
  20. struct aer_broadcast_data {
  21. enum pci_channel_state state;
  22. enum pci_ers_result result;
  23. };
  24. static pci_ers_result_t merge_result(enum pci_ers_result orig,
  25. enum pci_ers_result new)
  26. {
  27. if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
  28. return PCI_ERS_RESULT_NO_AER_DRIVER;
  29. if (new == PCI_ERS_RESULT_NONE)
  30. return orig;
  31. switch (orig) {
  32. case PCI_ERS_RESULT_CAN_RECOVER:
  33. case PCI_ERS_RESULT_RECOVERED:
  34. orig = new;
  35. break;
  36. case PCI_ERS_RESULT_DISCONNECT:
  37. if (new == PCI_ERS_RESULT_NEED_RESET)
  38. orig = PCI_ERS_RESULT_NEED_RESET;
  39. break;
  40. default:
  41. break;
  42. }
  43. return orig;
  44. }
  45. static int report_error_detected(struct pci_dev *dev, void *data)
  46. {
  47. pci_ers_result_t vote;
  48. const struct pci_error_handlers *err_handler;
  49. struct aer_broadcast_data *result_data;
  50. result_data = (struct aer_broadcast_data *) data;
  51. device_lock(&dev->dev);
  52. dev->error_state = result_data->state;
  53. if (!dev->driver ||
  54. !dev->driver->err_handler ||
  55. !dev->driver->err_handler->error_detected) {
  56. if (result_data->state == pci_channel_io_frozen &&
  57. dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
  58. /*
  59. * In case of fatal recovery, if one of down-
  60. * stream device has no driver. We might be
  61. * unable to recover because a later insmod
  62. * of a driver for this device is unaware of
  63. * its hw state.
  64. */
  65. pci_printk(KERN_DEBUG, dev, "device has %s\n",
  66. dev->driver ?
  67. "no AER-aware driver" : "no driver");
  68. }
  69. /*
  70. * If there's any device in the subtree that does not
  71. * have an error_detected callback, returning
  72. * PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
  73. * the subsequent mmio_enabled/slot_reset/resume
  74. * callbacks of "any" device in the subtree. All the
  75. * devices in the subtree are left in the error state
  76. * without recovery.
  77. */
  78. if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
  79. vote = PCI_ERS_RESULT_NO_AER_DRIVER;
  80. else
  81. vote = PCI_ERS_RESULT_NONE;
  82. } else {
  83. err_handler = dev->driver->err_handler;
  84. vote = err_handler->error_detected(dev, result_data->state);
  85. pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
  86. }
  87. result_data->result = merge_result(result_data->result, vote);
  88. device_unlock(&dev->dev);
  89. return 0;
  90. }
  91. static int report_mmio_enabled(struct pci_dev *dev, void *data)
  92. {
  93. pci_ers_result_t vote;
  94. const struct pci_error_handlers *err_handler;
  95. struct aer_broadcast_data *result_data;
  96. result_data = (struct aer_broadcast_data *) data;
  97. device_lock(&dev->dev);
  98. if (!dev->driver ||
  99. !dev->driver->err_handler ||
  100. !dev->driver->err_handler->mmio_enabled)
  101. goto out;
  102. err_handler = dev->driver->err_handler;
  103. vote = err_handler->mmio_enabled(dev);
  104. result_data->result = merge_result(result_data->result, vote);
  105. out:
  106. device_unlock(&dev->dev);
  107. return 0;
  108. }
  109. static int report_slot_reset(struct pci_dev *dev, void *data)
  110. {
  111. pci_ers_result_t vote;
  112. const struct pci_error_handlers *err_handler;
  113. struct aer_broadcast_data *result_data;
  114. result_data = (struct aer_broadcast_data *) data;
  115. device_lock(&dev->dev);
  116. if (!dev->driver ||
  117. !dev->driver->err_handler ||
  118. !dev->driver->err_handler->slot_reset)
  119. goto out;
  120. err_handler = dev->driver->err_handler;
  121. vote = err_handler->slot_reset(dev);
  122. result_data->result = merge_result(result_data->result, vote);
  123. out:
  124. device_unlock(&dev->dev);
  125. return 0;
  126. }
  127. static int report_resume(struct pci_dev *dev, void *data)
  128. {
  129. const struct pci_error_handlers *err_handler;
  130. device_lock(&dev->dev);
  131. dev->error_state = pci_channel_io_normal;
  132. if (!dev->driver ||
  133. !dev->driver->err_handler ||
  134. !dev->driver->err_handler->resume)
  135. goto out;
  136. err_handler = dev->driver->err_handler;
  137. err_handler->resume(dev);
  138. pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
  139. out:
  140. device_unlock(&dev->dev);
  141. return 0;
  142. }
  143. /**
  144. * default_reset_link - default reset function
  145. * @dev: pointer to pci_dev data structure
  146. *
  147. * Invoked when performing link reset on a Downstream Port or a
  148. * Root Port with no aer driver.
  149. */
  150. static pci_ers_result_t default_reset_link(struct pci_dev *dev)
  151. {
  152. pci_reset_bridge_secondary_bus(dev);
  153. pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n");
  154. return PCI_ERS_RESULT_RECOVERED;
  155. }
  156. static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
  157. {
  158. struct pci_dev *udev;
  159. pci_ers_result_t status;
  160. struct pcie_port_service_driver *driver = NULL;
  161. if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
  162. /* Reset this port for all subordinates */
  163. udev = dev;
  164. } else {
  165. /* Reset the upstream component (likely downstream port) */
  166. udev = dev->bus->self;
  167. }
  168. /* Use the aer driver of the component firstly */
  169. driver = pcie_port_find_service(udev, service);
  170. if (driver && driver->reset_link) {
  171. status = driver->reset_link(udev);
  172. } else if (udev->has_secondary_link) {
  173. status = default_reset_link(udev);
  174. } else {
  175. pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
  176. pci_name(udev));
  177. return PCI_ERS_RESULT_DISCONNECT;
  178. }
  179. if (status != PCI_ERS_RESULT_RECOVERED) {
  180. pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n",
  181. pci_name(udev));
  182. return PCI_ERS_RESULT_DISCONNECT;
  183. }
  184. return status;
  185. }
  186. /**
  187. * broadcast_error_message - handle message broadcast to downstream drivers
  188. * @dev: pointer to from where in a hierarchy message is broadcasted down
  189. * @state: error state
  190. * @error_mesg: message to print
  191. * @cb: callback to be broadcasted
  192. *
  193. * Invoked during error recovery process. Once being invoked, the content
  194. * of error severity will be broadcasted to all downstream drivers in a
  195. * hierarchy in question.
  196. */
  197. static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
  198. enum pci_channel_state state,
  199. char *error_mesg,
  200. int (*cb)(struct pci_dev *, void *))
  201. {
  202. struct aer_broadcast_data result_data;
  203. pci_printk(KERN_DEBUG, dev, "broadcast %s message\n", error_mesg);
  204. result_data.state = state;
  205. if (cb == report_error_detected)
  206. result_data.result = PCI_ERS_RESULT_CAN_RECOVER;
  207. else
  208. result_data.result = PCI_ERS_RESULT_RECOVERED;
  209. if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
  210. /*
  211. * If the error is reported by a bridge, we think this error
  212. * is related to the downstream link of the bridge, so we
  213. * do error recovery on all subordinates of the bridge instead
  214. * of the bridge and clear the error status of the bridge.
  215. */
  216. if (cb == report_error_detected)
  217. dev->error_state = state;
  218. pci_walk_bus(dev->subordinate, cb, &result_data);
  219. if (cb == report_resume) {
  220. pci_cleanup_aer_uncorrect_error_status(dev);
  221. dev->error_state = pci_channel_io_normal;
  222. }
  223. } else {
  224. /*
  225. * If the error is reported by an end point, we think this
  226. * error is related to the upstream link of the end point.
  227. */
  228. if (state == pci_channel_io_normal)
  229. /*
  230. * the error is non fatal so the bus is ok, just invoke
  231. * the callback for the function that logged the error.
  232. */
  233. cb(dev, &result_data);
  234. else
  235. pci_walk_bus(dev->bus, cb, &result_data);
  236. }
  237. return result_data.result;
  238. }
  239. /**
  240. * pcie_do_fatal_recovery - handle fatal error recovery process
  241. * @dev: pointer to a pci_dev data structure of agent detecting an error
  242. *
  243. * Invoked when an error is fatal. Once being invoked, removes the devices
  244. * beneath this AER agent, followed by reset link e.g. secondary bus reset
  245. * followed by re-enumeration of devices.
  246. */
  247. void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
  248. {
  249. struct pci_dev *udev;
  250. struct pci_bus *parent;
  251. struct pci_dev *pdev, *temp;
  252. pci_ers_result_t result;
  253. if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
  254. udev = dev;
  255. else
  256. udev = dev->bus->self;
  257. parent = udev->subordinate;
  258. pci_lock_rescan_remove();
  259. list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
  260. bus_list) {
  261. pci_dev_get(pdev);
  262. pci_dev_set_disconnected(pdev, NULL);
  263. if (pci_has_subordinate(pdev))
  264. pci_walk_bus(pdev->subordinate,
  265. pci_dev_set_disconnected, NULL);
  266. pci_stop_and_remove_bus_device(pdev);
  267. pci_dev_put(pdev);
  268. }
  269. result = reset_link(udev, service);
  270. if ((service == PCIE_PORT_SERVICE_AER) &&
  271. (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
  272. /*
  273. * If the error is reported by a bridge, we think this error
  274. * is related to the downstream link of the bridge, so we
  275. * do error recovery on all subordinates of the bridge instead
  276. * of the bridge and clear the error status of the bridge.
  277. */
  278. pci_cleanup_aer_uncorrect_error_status(dev);
  279. }
  280. if (result == PCI_ERS_RESULT_RECOVERED) {
  281. if (pcie_wait_for_link(udev, true))
  282. pci_rescan_bus(udev->bus);
  283. pci_info(dev, "Device recovery from fatal error successful\n");
  284. } else {
  285. pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
  286. pci_info(dev, "Device recovery from fatal error failed\n");
  287. }
  288. pci_unlock_rescan_remove();
  289. }
  290. /**
  291. * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
  292. * @dev: pointer to a pci_dev data structure of agent detecting an error
  293. *
  294. * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
  295. * error detected message to all downstream drivers within a hierarchy in
  296. * question and return the returned code.
  297. */
  298. void pcie_do_nonfatal_recovery(struct pci_dev *dev)
  299. {
  300. pci_ers_result_t status;
  301. enum pci_channel_state state;
  302. state = pci_channel_io_normal;
  303. status = broadcast_error_message(dev,
  304. state,
  305. "error_detected",
  306. report_error_detected);
  307. if (status == PCI_ERS_RESULT_CAN_RECOVER)
  308. status = broadcast_error_message(dev,
  309. state,
  310. "mmio_enabled",
  311. report_mmio_enabled);
  312. if (status == PCI_ERS_RESULT_NEED_RESET) {
  313. /*
  314. * TODO: Should call platform-specific
  315. * functions to reset slot before calling
  316. * drivers' slot_reset callbacks?
  317. */
  318. status = broadcast_error_message(dev,
  319. state,
  320. "slot_reset",
  321. report_slot_reset);
  322. }
  323. if (status != PCI_ERS_RESULT_RECOVERED)
  324. goto failed;
  325. broadcast_error_message(dev,
  326. state,
  327. "resume",
  328. report_resume);
  329. pci_info(dev, "AER: Device recovery successful\n");
  330. return;
  331. failed:
  332. pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
  333. /* TODO: Should kernel panic here? */
  334. pci_info(dev, "AER: Device recovery failed\n");
  335. }