eeh-ioda.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890
  1. /*
  2. * The file intends to implement the functions needed by EEH, which is
  3. * built on IODA compliant chip. Actually, lots of functions related
  4. * to EEH would be built based on the OPAL APIs.
  5. *
  6. * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
  7. *
  8. * This program is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License as published by
  10. * the Free Software Foundation; either version 2 of the License, or
  11. * (at your option) any later version.
  12. */
  13. #include <linux/bootmem.h>
  14. #include <linux/debugfs.h>
  15. #include <linux/delay.h>
  16. #include <linux/io.h>
  17. #include <linux/irq.h>
  18. #include <linux/kernel.h>
  19. #include <linux/msi.h>
  20. #include <linux/notifier.h>
  21. #include <linux/pci.h>
  22. #include <linux/string.h>
  23. #include <asm/eeh.h>
  24. #include <asm/eeh_event.h>
  25. #include <asm/io.h>
  26. #include <asm/iommu.h>
  27. #include <asm/msi_bitmap.h>
  28. #include <asm/opal.h>
  29. #include <asm/pci-bridge.h>
  30. #include <asm/ppc-pci.h>
  31. #include <asm/tce.h>
  32. #include "powernv.h"
  33. #include "pci.h"
/* Non-zero once the OPAL event notifier (ioda_eeh_nb) has been registered */
static int ioda_eeh_nb_init = 0;
  35. static int ioda_eeh_event(struct notifier_block *nb,
  36. unsigned long events, void *change)
  37. {
  38. uint64_t changed_evts = (uint64_t)change;
  39. /*
  40. * We simply send special EEH event if EEH has
  41. * been enabled, or clear pending events in
  42. * case that we enable EEH soon
  43. */
  44. if (!(changed_evts & OPAL_EVENT_PCI_ERROR) ||
  45. !(events & OPAL_EVENT_PCI_ERROR))
  46. return 0;
  47. if (eeh_enabled())
  48. eeh_send_failure_event(NULL);
  49. else
  50. opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
  51. return 0;
  52. }
/* Notifier block hooked into the OPAL event framework for PCI errors */
static struct notifier_block ioda_eeh_nb = {
	.notifier_call	= ioda_eeh_event,
	.next		= NULL,
	.priority	= 0
};
  58. #ifdef CONFIG_DEBUG_FS
  59. static int ioda_eeh_dbgfs_set(void *data, int offset, u64 val)
  60. {
  61. struct pci_controller *hose = data;
  62. struct pnv_phb *phb = hose->private_data;
  63. out_be64(phb->regs + offset, val);
  64. return 0;
  65. }
  66. static int ioda_eeh_dbgfs_get(void *data, int offset, u64 *val)
  67. {
  68. struct pci_controller *hose = data;
  69. struct pnv_phb *phb = hose->private_data;
  70. *val = in_be64(phb->regs + offset);
  71. return 0;
  72. }
  73. static int ioda_eeh_outb_dbgfs_set(void *data, u64 val)
  74. {
  75. return ioda_eeh_dbgfs_set(data, 0xD10, val);
  76. }
  77. static int ioda_eeh_outb_dbgfs_get(void *data, u64 *val)
  78. {
  79. return ioda_eeh_dbgfs_get(data, 0xD10, val);
  80. }
  81. static int ioda_eeh_inbA_dbgfs_set(void *data, u64 val)
  82. {
  83. return ioda_eeh_dbgfs_set(data, 0xD90, val);
  84. }
  85. static int ioda_eeh_inbA_dbgfs_get(void *data, u64 *val)
  86. {
  87. return ioda_eeh_dbgfs_get(data, 0xD90, val);
  88. }
  89. static int ioda_eeh_inbB_dbgfs_set(void *data, u64 val)
  90. {
  91. return ioda_eeh_dbgfs_set(data, 0xE10, val);
  92. }
  93. static int ioda_eeh_inbB_dbgfs_get(void *data, u64 *val)
  94. {
  95. return ioda_eeh_dbgfs_get(data, 0xE10, val);
  96. }
  97. DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_outb_dbgfs_ops, ioda_eeh_outb_dbgfs_get,
  98. ioda_eeh_outb_dbgfs_set, "0x%llx\n");
  99. DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbA_dbgfs_ops, ioda_eeh_inbA_dbgfs_get,
  100. ioda_eeh_inbA_dbgfs_set, "0x%llx\n");
  101. DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbB_dbgfs_ops, ioda_eeh_inbB_dbgfs_get,
  102. ioda_eeh_inbB_dbgfs_set, "0x%llx\n");
  103. #endif /* CONFIG_DEBUG_FS */
/**
 * ioda_eeh_post_init - Chip dependent post initialization
 * @hose: PCI controller
 *
 * The function will be called after eeh PEs and devices
 * have been built. That means the EEH is ready to supply
 * service with I/O cache.
 *
 * Returns 0 on success, or the error code from
 * opal_notifier_register() on failure.
 */
static int ioda_eeh_post_init(struct pci_controller *hose)
{
	struct pnv_phb *phb = hose->private_data;
	int ret;

	/*
	 * Register OPAL event notifier. The notifier is global,
	 * so only do it once even though this function runs per PHB.
	 */
	if (!ioda_eeh_nb_init) {
		ret = opal_notifier_register(&ioda_eeh_nb);
		if (ret) {
			pr_err("%s: Can't register OPAL event notifier (%d)\n",
			       __func__, ret);
			return ret;
		}
		ioda_eeh_nb_init = 1;
	}

#ifdef CONFIG_DEBUG_FS
	/* Expose error-injection knobs once per PHB, if debugfs is set up */
	if (!phb->has_dbgfs && phb->dbgfs) {
		phb->has_dbgfs = 1;
		debugfs_create_file("err_injct_outbound", 0600,
				    phb->dbgfs, hose,
				    &ioda_eeh_outb_dbgfs_ops);
		debugfs_create_file("err_injct_inboundA", 0600,
				    phb->dbgfs, hose,
				    &ioda_eeh_inbA_dbgfs_ops);
		debugfs_create_file("err_injct_inboundB", 0600,
				    phb->dbgfs, hose,
				    &ioda_eeh_inbB_dbgfs_ops);
	}
#endif

	/* If EEH is enabled, we're going to rely on that.
	 * Otherwise, we restore to conventional mechanism
	 * to clear frozen PE during PCI config access.
	 */
	if (eeh_enabled())
		phb->flags |= PNV_PHB_FLAG_EEH;
	else
		phb->flags &= ~PNV_PHB_FLAG_EEH;

	return 0;
}
  150. /**
  151. * ioda_eeh_set_option - Set EEH operation or I/O setting
  152. * @pe: EEH PE
  153. * @option: options
  154. *
  155. * Enable or disable EEH option for the indicated PE. The
  156. * function also can be used to enable I/O or DMA for the
  157. * PE.
  158. */
  159. static int ioda_eeh_set_option(struct eeh_pe *pe, int option)
  160. {
  161. s64 ret;
  162. u32 pe_no;
  163. struct pci_controller *hose = pe->phb;
  164. struct pnv_phb *phb = hose->private_data;
  165. /* Check on PE number */
  166. if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) {
  167. pr_err("%s: PE address %x out of range [0, %x] "
  168. "on PHB#%x\n",
  169. __func__, pe->addr, phb->ioda.total_pe,
  170. hose->global_number);
  171. return -EINVAL;
  172. }
  173. pe_no = pe->addr;
  174. switch (option) {
  175. case EEH_OPT_DISABLE:
  176. ret = -EEXIST;
  177. break;
  178. case EEH_OPT_ENABLE:
  179. ret = 0;
  180. break;
  181. case EEH_OPT_THAW_MMIO:
  182. ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
  183. OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO);
  184. if (ret) {
  185. pr_warning("%s: Failed to enable MMIO for "
  186. "PHB#%x-PE#%x, err=%lld\n",
  187. __func__, hose->global_number, pe_no, ret);
  188. return -EIO;
  189. }
  190. break;
  191. case EEH_OPT_THAW_DMA:
  192. ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
  193. OPAL_EEH_ACTION_CLEAR_FREEZE_DMA);
  194. if (ret) {
  195. pr_warning("%s: Failed to enable DMA for "
  196. "PHB#%x-PE#%x, err=%lld\n",
  197. __func__, hose->global_number, pe_no, ret);
  198. return -EIO;
  199. }
  200. break;
  201. default:
  202. pr_warning("%s: Invalid option %d\n", __func__, option);
  203. return -EINVAL;
  204. }
  205. return ret;
  206. }
  207. static void ioda_eeh_phb_diag(struct pci_controller *hose)
  208. {
  209. struct pnv_phb *phb = hose->private_data;
  210. long rc;
  211. rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
  212. PNV_PCI_DIAG_BUF_SIZE);
  213. if (rc != OPAL_SUCCESS) {
  214. pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
  215. __func__, hose->global_number, rc);
  216. return;
  217. }
  218. pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
  219. }
/**
 * ioda_eeh_get_state - Retrieve the state of PE
 * @pe: EEH PE
 *
 * The PE's state should be retrieved from the PEEV, PEST
 * IODA tables. Since the OPAL has exported the function
 * to do it, it'd better to use that.
 *
 * Returns a bitmask of EEH_STATE_* flags, or
 * EEH_STATE_NOT_SUPPORT when the state can't be determined.
 */
static int ioda_eeh_get_state(struct eeh_pe *pe)
{
	s64 ret = 0;
	u8 fstate;
	__be16 pcierr;
	u32 pe_no;
	int result;
	struct pci_controller *hose = pe->phb;
	struct pnv_phb *phb = hose->private_data;

	/*
	 * Sanity check on PE address. The PHB PE address should
	 * be zero.
	 */
	if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) {
		pr_err("%s: PE address %x out of range [0, %x] "
		       "on PHB#%x\n",
		       __func__, pe->addr, phb->ioda.total_pe,
		       hose->global_number);
		return EEH_STATE_NOT_SUPPORT;
	}

	/*
	 * If we're in middle of PE reset, return normal
	 * state to keep EEH core going. For PHB reset, we
	 * still expect to have fenced PHB cleared with
	 * PHB reset.
	 */
	if (!(pe->type & EEH_PE_PHB) &&
	    (pe->state & EEH_PE_RESET)) {
		result = (EEH_STATE_MMIO_ACTIVE |
			  EEH_STATE_DMA_ACTIVE |
			  EEH_STATE_MMIO_ENABLED |
			  EEH_STATE_DMA_ENABLED);
		return result;
	}

	/* Retrieve PE status through OPAL */
	pe_no = pe->addr;
	ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
			&fstate, &pcierr, NULL);
	if (ret) {
		pr_err("%s: Failed to get EEH status on "
		       "PHB#%x-PE#%x\n, err=%lld\n",
		       __func__, hose->global_number, pe_no, ret);
		return EEH_STATE_NOT_SUPPORT;
	}

	/*
	 * Check PHB status. For a PHB PE only the PCI error word
	 * matters: anything other than a PHB error means fully
	 * functional; a PHB error marks the PE isolated and dumps
	 * diag-data once.
	 */
	if (pe->type & EEH_PE_PHB) {
		result = 0;
		/* No-op on 0; kept to mirror the frozen-state cases below */
		result &= ~EEH_STATE_RESET_ACTIVE;
		if (be16_to_cpu(pcierr) != OPAL_EEH_PHB_ERROR) {
			result |= EEH_STATE_MMIO_ACTIVE;
			result |= EEH_STATE_DMA_ACTIVE;
			result |= EEH_STATE_MMIO_ENABLED;
			result |= EEH_STATE_DMA_ENABLED;
		} else if (!(pe->state & EEH_PE_ISOLATED)) {
			eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
			ioda_eeh_phb_diag(hose);
		}
		return result;
	}

	/* Map the OPAL freeze state onto EEH state flags */
	result = 0;
	switch (fstate) {
	case OPAL_EEH_STOPPED_NOT_FROZEN:
		result &= ~EEH_STATE_RESET_ACTIVE;
		result |= EEH_STATE_MMIO_ACTIVE;
		result |= EEH_STATE_DMA_ACTIVE;
		result |= EEH_STATE_MMIO_ENABLED;
		result |= EEH_STATE_DMA_ENABLED;
		break;
	case OPAL_EEH_STOPPED_MMIO_FREEZE:
		result &= ~EEH_STATE_RESET_ACTIVE;
		result |= EEH_STATE_DMA_ACTIVE;
		result |= EEH_STATE_DMA_ENABLED;
		break;
	case OPAL_EEH_STOPPED_DMA_FREEZE:
		result &= ~EEH_STATE_RESET_ACTIVE;
		result |= EEH_STATE_MMIO_ACTIVE;
		result |= EEH_STATE_MMIO_ENABLED;
		break;
	case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE:
		result &= ~EEH_STATE_RESET_ACTIVE;
		break;
	case OPAL_EEH_STOPPED_RESET:
		result |= EEH_STATE_RESET_ACTIVE;
		break;
	case OPAL_EEH_STOPPED_TEMP_UNAVAIL:
		result |= EEH_STATE_UNAVAILABLE;
		break;
	case OPAL_EEH_STOPPED_PERM_UNAVAIL:
		result |= EEH_STATE_NOT_SUPPORT;
		break;
	default:
		pr_warning("%s: Unexpected EEH status 0x%x "
			   "on PHB#%x-PE#%x\n",
			   __func__, fstate, hose->global_number, pe_no);
	}

	/*
	 * Dump PHB diag-data for frozen PE: the PE is frozen when
	 * either MMIO or DMA is inactive, and we only do it once
	 * per isolation.
	 */
	if (result != EEH_STATE_NOT_SUPPORT &&
	    (result & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) !=
	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) &&
	    !(pe->state & EEH_PE_ISOLATED)) {
		eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
		ioda_eeh_phb_diag(hose);
	}

	return result;
}
  334. static s64 ioda_eeh_phb_poll(struct pnv_phb *phb)
  335. {
  336. s64 rc = OPAL_HARDWARE;
  337. while (1) {
  338. rc = opal_pci_poll(phb->opal_id);
  339. if (rc <= 0)
  340. break;
  341. if (system_state < SYSTEM_RUNNING)
  342. udelay(1000 * rc);
  343. else
  344. msleep(rc);
  345. }
  346. return rc;
  347. }
  348. int ioda_eeh_phb_reset(struct pci_controller *hose, int option)
  349. {
  350. struct pnv_phb *phb = hose->private_data;
  351. s64 rc = OPAL_HARDWARE;
  352. pr_debug("%s: Reset PHB#%x, option=%d\n",
  353. __func__, hose->global_number, option);
  354. /* Issue PHB complete reset request */
  355. if (option == EEH_RESET_FUNDAMENTAL ||
  356. option == EEH_RESET_HOT)
  357. rc = opal_pci_reset(phb->opal_id,
  358. OPAL_PHB_COMPLETE,
  359. OPAL_ASSERT_RESET);
  360. else if (option == EEH_RESET_DEACTIVATE)
  361. rc = opal_pci_reset(phb->opal_id,
  362. OPAL_PHB_COMPLETE,
  363. OPAL_DEASSERT_RESET);
  364. if (rc < 0)
  365. goto out;
  366. /*
  367. * Poll state of the PHB until the request is done
  368. * successfully. The PHB reset is usually PHB complete
  369. * reset followed by hot reset on root bus. So we also
  370. * need the PCI bus settlement delay.
  371. */
  372. rc = ioda_eeh_phb_poll(phb);
  373. if (option == EEH_RESET_DEACTIVATE) {
  374. if (system_state < SYSTEM_RUNNING)
  375. udelay(1000 * EEH_PE_RST_SETTLE_TIME);
  376. else
  377. msleep(EEH_PE_RST_SETTLE_TIME);
  378. }
  379. out:
  380. if (rc != OPAL_SUCCESS)
  381. return -EIO;
  382. return 0;
  383. }
  384. static int ioda_eeh_root_reset(struct pci_controller *hose, int option)
  385. {
  386. struct pnv_phb *phb = hose->private_data;
  387. s64 rc = OPAL_SUCCESS;
  388. pr_debug("%s: Reset PHB#%x, option=%d\n",
  389. __func__, hose->global_number, option);
  390. /*
  391. * During the reset deassert time, we needn't care
  392. * the reset scope because the firmware does nothing
  393. * for fundamental or hot reset during deassert phase.
  394. */
  395. if (option == EEH_RESET_FUNDAMENTAL)
  396. rc = opal_pci_reset(phb->opal_id,
  397. OPAL_PCI_FUNDAMENTAL_RESET,
  398. OPAL_ASSERT_RESET);
  399. else if (option == EEH_RESET_HOT)
  400. rc = opal_pci_reset(phb->opal_id,
  401. OPAL_PCI_HOT_RESET,
  402. OPAL_ASSERT_RESET);
  403. else if (option == EEH_RESET_DEACTIVATE)
  404. rc = opal_pci_reset(phb->opal_id,
  405. OPAL_PCI_HOT_RESET,
  406. OPAL_DEASSERT_RESET);
  407. if (rc < 0)
  408. goto out;
  409. /* Poll state of the PHB until the request is done */
  410. rc = ioda_eeh_phb_poll(phb);
  411. if (option == EEH_RESET_DEACTIVATE)
  412. msleep(EEH_PE_RST_SETTLE_TIME);
  413. out:
  414. if (rc != OPAL_SUCCESS)
  415. return -EIO;
  416. return 0;
  417. }
/*
 * Reset the secondary bus below a p2p bridge by toggling the bridge's
 * secondary-bus-reset bit through EEH config accessors. While the reset
 * is asserted, the AER "surprise down" uncorrectable error is masked so
 * the expected link drop isn't reported as a new error; the mask is
 * restored on deactivate. Always returns 0.
 */
static int ioda_eeh_bridge_reset(struct pci_dev *dev, int option)
{
	struct device_node *dn = pci_device_to_OF_node(dev);
	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
	/* AER capability offset, or 0 when the device has none */
	int aer = edev ? edev->aer_cap : 0;
	u32 ctrl;

	pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
		 __func__, pci_domain_nr(dev->bus),
		 dev->bus->number, option);

	switch (option) {
	case EEH_RESET_FUNDAMENTAL:
	case EEH_RESET_HOT:
		/* Don't report linkDown event */
		if (aer) {
			eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK,
					     4, &ctrl);
			ctrl |= PCI_ERR_UNC_SURPDN;
			eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK,
					      4, ctrl);
		}

		/* Assert secondary bus reset and hold it */
		eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl);
		ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
		eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl);
		msleep(EEH_PE_RST_HOLD_TIME);
		break;
	case EEH_RESET_DEACTIVATE:
		/* Deassert the reset and let the bus settle */
		eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl);
		ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
		eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl);
		msleep(EEH_PE_RST_SETTLE_TIME);

		/* Continue reporting linkDown event */
		if (aer) {
			eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK,
					     4, &ctrl);
			ctrl &= ~PCI_ERR_UNC_SURPDN;
			eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK,
					      4, ctrl);
		}
		break;
	}

	return 0;
}
  460. void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
  461. {
  462. struct pci_controller *hose;
  463. if (pci_is_root_bus(dev->bus)) {
  464. hose = pci_bus_to_host(dev->bus);
  465. ioda_eeh_root_reset(hose, EEH_RESET_HOT);
  466. ioda_eeh_root_reset(hose, EEH_RESET_DEACTIVATE);
  467. } else {
  468. ioda_eeh_bridge_reset(dev, EEH_RESET_HOT);
  469. ioda_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE);
  470. }
  471. }
  472. /**
  473. * ioda_eeh_reset - Reset the indicated PE
  474. * @pe: EEH PE
  475. * @option: reset option
  476. *
  477. * Do reset on the indicated PE. For PCI bus sensitive PE,
  478. * we need to reset the parent p2p bridge. The PHB has to
  479. * be reinitialized if the p2p bridge is root bridge. For
  480. * PCI device sensitive PE, we will try to reset the device
  481. * through FLR. For now, we don't have OPAL APIs to do HARD
  482. * reset yet, so all reset would be SOFT (HOT) reset.
  483. */
  484. static int ioda_eeh_reset(struct eeh_pe *pe, int option)
  485. {
  486. struct pci_controller *hose = pe->phb;
  487. struct pci_bus *bus;
  488. int ret;
  489. /*
  490. * For PHB reset, we always have complete reset. For those PEs whose
  491. * primary bus derived from root complex (root bus) or root port
  492. * (usually bus#1), we apply hot or fundamental reset on the root port.
  493. * For other PEs, we always have hot reset on the PE primary bus.
  494. *
  495. * Here, we have different design to pHyp, which always clear the
  496. * frozen state during PE reset. However, the good idea here from
  497. * benh is to keep frozen state before we get PE reset done completely
  498. * (until BAR restore). With the frozen state, HW drops illegal IO
  499. * or MMIO access, which can incur recrusive frozen PE during PE
  500. * reset. The side effect is that EEH core has to clear the frozen
  501. * state explicitly after BAR restore.
  502. */
  503. if (pe->type & EEH_PE_PHB) {
  504. ret = ioda_eeh_phb_reset(hose, option);
  505. } else {
  506. bus = eeh_pe_bus_get(pe);
  507. if (pci_is_root_bus(bus) ||
  508. pci_is_root_bus(bus->parent))
  509. ret = ioda_eeh_root_reset(hose, option);
  510. else
  511. ret = ioda_eeh_bridge_reset(bus->self, option);
  512. }
  513. return ret;
  514. }
/**
 * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE
 * @pe: EEH PE
 *
 * For particular PE, it might have included PCI bridges. In order
 * to make the PE work properly, those PCI bridges should be configured
 * correctly. However, we need do nothing on P7IOC since the reset
 * function will do everything that should be covered by the function.
 *
 * Always returns 0 (intentional no-op for this backend).
 */
static int ioda_eeh_configure_bridge(struct eeh_pe *pe)
{
	return 0;
}
/* Dump the GEM and LEM registers shared by all P7IOC hub diag-data types */
static void ioda_eeh_hub_diag_common(struct OpalIoP7IOCErrorData *data)
{
	/* GEM */
	pr_info("  GEM XFIR:        %016llx\n", data->gemXfir);
	pr_info("  GEM RFIR:        %016llx\n", data->gemRfir);
	pr_info("  GEM RIRQFIR:     %016llx\n", data->gemRirqfir);
	pr_info("  GEM Mask:        %016llx\n", data->gemMask);
	pr_info("  GEM RWOF:        %016llx\n", data->gemRwof);

	/* LEM */
	pr_info("  LEM FIR:         %016llx\n", data->lemFir);
	pr_info("  LEM Error Mask:  %016llx\n", data->lemErrMask);
	pr_info("  LEM Action 0:    %016llx\n", data->lemAction0);
	pr_info("  LEM Action 1:    %016llx\n", data->lemAction1);
	pr_info("  LEM WOF:         %016llx\n", data->lemWof);
}
  543. static void ioda_eeh_hub_diag(struct pci_controller *hose)
  544. {
  545. struct pnv_phb *phb = hose->private_data;
  546. struct OpalIoP7IOCErrorData *data = &phb->diag.hub_diag;
  547. long rc;
  548. rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data));
  549. if (rc != OPAL_SUCCESS) {
  550. pr_warning("%s: Failed to get HUB#%llx diag-data (%ld)\n",
  551. __func__, phb->hub_id, rc);
  552. return;
  553. }
  554. switch (data->type) {
  555. case OPAL_P7IOC_DIAG_TYPE_RGC:
  556. pr_info("P7IOC diag-data for RGC\n\n");
  557. ioda_eeh_hub_diag_common(data);
  558. pr_info(" RGC Status: %016llx\n", data->rgc.rgcStatus);
  559. pr_info(" RGC LDCP: %016llx\n", data->rgc.rgcLdcp);
  560. break;
  561. case OPAL_P7IOC_DIAG_TYPE_BI:
  562. pr_info("P7IOC diag-data for BI %s\n\n",
  563. data->bi.biDownbound ? "Downbound" : "Upbound");
  564. ioda_eeh_hub_diag_common(data);
  565. pr_info(" BI LDCP 0: %016llx\n", data->bi.biLdcp0);
  566. pr_info(" BI LDCP 1: %016llx\n", data->bi.biLdcp1);
  567. pr_info(" BI LDCP 2: %016llx\n", data->bi.biLdcp2);
  568. pr_info(" BI Fence Status: %016llx\n", data->bi.biFenceStatus);
  569. break;
  570. case OPAL_P7IOC_DIAG_TYPE_CI:
  571. pr_info("P7IOC diag-data for CI Port %d\\nn",
  572. data->ci.ciPort);
  573. ioda_eeh_hub_diag_common(data);
  574. pr_info(" CI Port Status: %016llx\n", data->ci.ciPortStatus);
  575. pr_info(" CI Port LDCP: %016llx\n", data->ci.ciPortLdcp);
  576. break;
  577. case OPAL_P7IOC_DIAG_TYPE_MISC:
  578. pr_info("P7IOC diag-data for MISC\n\n");
  579. ioda_eeh_hub_diag_common(data);
  580. break;
  581. case OPAL_P7IOC_DIAG_TYPE_I2C:
  582. pr_info("P7IOC diag-data for I2C\n\n");
  583. ioda_eeh_hub_diag_common(data);
  584. break;
  585. default:
  586. pr_warning("%s: Invalid type of HUB#%llx diag-data (%d)\n",
  587. __func__, phb->hub_id, data->type);
  588. }
  589. }
  590. static int ioda_eeh_get_pe(struct pci_controller *hose,
  591. u16 pe_no, struct eeh_pe **pe)
  592. {
  593. struct eeh_pe *phb_pe, *dev_pe;
  594. struct eeh_dev dev;
  595. /* Find the PHB PE */
  596. phb_pe = eeh_phb_pe_get(hose);
  597. if (!phb_pe)
  598. return -EEXIST;
  599. /* Find the PE according to PE# */
  600. memset(&dev, 0, sizeof(struct eeh_dev));
  601. dev.phb = hose;
  602. dev.pe_config_addr = pe_no;
  603. dev_pe = eeh_pe_get(&dev);
  604. if (!dev_pe) return -EEXIST;
  605. *pe = dev_pe;
  606. return 0;
  607. }
  608. /**
  609. * ioda_eeh_next_error - Retrieve next error for EEH core to handle
  610. * @pe: The affected PE
  611. *
  612. * The function is expected to be called by EEH core while it gets
  613. * special EEH event (without binding PE). The function calls to
  614. * OPAL APIs for next error to handle. The informational error is
  615. * handled internally by platform. However, the dead IOC, dead PHB,
  616. * fenced PHB and frozen PE should be handled by EEH core eventually.
  617. */
  618. static int ioda_eeh_next_error(struct eeh_pe **pe)
  619. {
  620. struct pci_controller *hose;
  621. struct pnv_phb *phb;
  622. struct eeh_pe *phb_pe, *parent_pe;
  623. __be64 frozen_pe_no;
  624. __be16 err_type, severity;
  625. int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
  626. long rc;
  627. int state, ret = EEH_NEXT_ERR_NONE;
  628. /*
  629. * While running here, it's safe to purge the event queue.
  630. * And we should keep the cached OPAL notifier event sychronized
  631. * between the kernel and firmware.
  632. */
  633. eeh_remove_event(NULL, false);
  634. opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
  635. list_for_each_entry(hose, &hose_list, list_node) {
  636. /*
  637. * If the subordinate PCI buses of the PHB has been
  638. * removed or is exactly under error recovery, we
  639. * needn't take care of it any more.
  640. */
  641. phb = hose->private_data;
  642. phb_pe = eeh_phb_pe_get(hose);
  643. if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
  644. continue;
  645. rc = opal_pci_next_error(phb->opal_id,
  646. &frozen_pe_no, &err_type, &severity);
  647. /* If OPAL API returns error, we needn't proceed */
  648. if (rc != OPAL_SUCCESS) {
  649. pr_devel("%s: Invalid return value on "
  650. "PHB#%x (0x%lx) from opal_pci_next_error",
  651. __func__, hose->global_number, rc);
  652. continue;
  653. }
  654. /* If the PHB doesn't have error, stop processing */
  655. if (be16_to_cpu(err_type) == OPAL_EEH_NO_ERROR ||
  656. be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) {
  657. pr_devel("%s: No error found on PHB#%x\n",
  658. __func__, hose->global_number);
  659. continue;
  660. }
  661. /*
  662. * Processing the error. We're expecting the error with
  663. * highest priority reported upon multiple errors on the
  664. * specific PHB.
  665. */
  666. pr_devel("%s: Error (%d, %d, %llu) on PHB#%x\n",
  667. __func__, be16_to_cpu(err_type), be16_to_cpu(severity),
  668. be64_to_cpu(frozen_pe_no), hose->global_number);
  669. switch (be16_to_cpu(err_type)) {
  670. case OPAL_EEH_IOC_ERROR:
  671. if (be16_to_cpu(severity) == OPAL_EEH_SEV_IOC_DEAD) {
  672. pr_err("EEH: dead IOC detected\n");
  673. ret = EEH_NEXT_ERR_DEAD_IOC;
  674. } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
  675. pr_info("EEH: IOC informative error "
  676. "detected\n");
  677. ioda_eeh_hub_diag(hose);
  678. ret = EEH_NEXT_ERR_NONE;
  679. }
  680. break;
  681. case OPAL_EEH_PHB_ERROR:
  682. if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) {
  683. *pe = phb_pe;
  684. pr_err("EEH: dead PHB#%x detected, "
  685. "location: %s\n",
  686. hose->global_number,
  687. eeh_pe_loc_get(phb_pe));
  688. ret = EEH_NEXT_ERR_DEAD_PHB;
  689. } else if (be16_to_cpu(severity) ==
  690. OPAL_EEH_SEV_PHB_FENCED) {
  691. *pe = phb_pe;
  692. pr_err("EEH: Fenced PHB#%x detected, "
  693. "location: %s\n",
  694. hose->global_number,
  695. eeh_pe_loc_get(phb_pe));
  696. ret = EEH_NEXT_ERR_FENCED_PHB;
  697. } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
  698. pr_info("EEH: PHB#%x informative error "
  699. "detected, location: %s\n",
  700. hose->global_number,
  701. eeh_pe_loc_get(phb_pe));
  702. ioda_eeh_phb_diag(hose);
  703. ret = EEH_NEXT_ERR_NONE;
  704. }
  705. break;
  706. case OPAL_EEH_PE_ERROR:
  707. /*
  708. * If we can't find the corresponding PE, we
  709. * just try to unfreeze.
  710. */
  711. if (ioda_eeh_get_pe(hose,
  712. be64_to_cpu(frozen_pe_no), pe)) {
  713. /* Try best to clear it */
  714. pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
  715. hose->global_number, frozen_pe_no);
  716. pr_info("EEH: PHB location: %s\n",
  717. eeh_pe_loc_get(phb_pe));
  718. opal_pci_eeh_freeze_clear(phb->opal_id, frozen_pe_no,
  719. OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
  720. ret = EEH_NEXT_ERR_NONE;
  721. } else if ((*pe)->state & EEH_PE_ISOLATED) {
  722. ret = EEH_NEXT_ERR_NONE;
  723. } else {
  724. pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
  725. (*pe)->addr, (*pe)->phb->global_number);
  726. pr_err("EEH: PE location: %s, PHB location: %s\n",
  727. eeh_pe_loc_get(*pe), eeh_pe_loc_get(phb_pe));
  728. ret = EEH_NEXT_ERR_FROZEN_PE;
  729. }
  730. break;
  731. default:
  732. pr_warn("%s: Unexpected error type %d\n",
  733. __func__, be16_to_cpu(err_type));
  734. }
  735. /*
  736. * EEH core will try recover from fenced PHB or
  737. * frozen PE. In the time for frozen PE, EEH core
  738. * enable IO path for that before collecting logs,
  739. * but it ruins the site. So we have to dump the
  740. * log in advance here.
  741. */
  742. if ((ret == EEH_NEXT_ERR_FROZEN_PE ||
  743. ret == EEH_NEXT_ERR_FENCED_PHB) &&
  744. !((*pe)->state & EEH_PE_ISOLATED)) {
  745. eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
  746. ioda_eeh_phb_diag(hose);
  747. }
  748. /*
  749. * We probably have the frozen parent PE out there and
  750. * we need have to handle frozen parent PE firstly.
  751. */
  752. if (ret == EEH_NEXT_ERR_FROZEN_PE) {
  753. parent_pe = (*pe)->parent;
  754. while (parent_pe) {
  755. /* Hit the ceiling ? */
  756. if (parent_pe->type & EEH_PE_PHB)
  757. break;
  758. /* Frozen parent PE ? */
  759. state = ioda_eeh_get_state(parent_pe);
  760. if (state > 0 &&
  761. (state & active_flags) != active_flags)
  762. *pe = parent_pe;
  763. /* Next parent level */
  764. parent_pe = parent_pe->parent;
  765. }
  766. /* We possibly migrate to another PE */
  767. eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
  768. }
  769. /*
  770. * If we have no errors on the specific PHB or only
  771. * informative error there, we continue poking it.
  772. * Otherwise, we need actions to be taken by upper
  773. * layer.
  774. */
  775. if (ret > EEH_NEXT_ERR_INF)
  776. break;
  777. }
  778. return ret;
  779. }
/* IODA chip-level EEH backend callbacks, consumed by the powernv EEH layer */
struct pnv_eeh_ops ioda_eeh_ops = {
	.post_init		= ioda_eeh_post_init,
	.set_option		= ioda_eeh_set_option,
	.get_state		= ioda_eeh_get_state,
	.reset			= ioda_eeh_reset,
	.configure_bridge	= ioda_eeh_configure_bridge,
	.next_error		= ioda_eeh_next_error
};