|
@@ -808,10 +808,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
|
|
|
pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
|
|
|
pe->phb->global_number, pe->addr,
|
|
|
pe->freeze_count);
|
|
|
- goto hard_fail;
|
|
|
+ result = PCI_ERS_RESULT_DISCONNECT;
|
|
|
}
|
|
|
- pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
|
|
|
- pe->freeze_count, eeh_max_freezes);
|
|
|
|
|
|
/* Walk the various device drivers attached to this slot through
|
|
|
* a reset sequence, giving each an opportunity to do what it needs
|
|
@@ -823,31 +821,39 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
|
|
|
* the error. Override the result if necessary to have partially
|
|
|
* hotplug for this case.
|
|
|
*/
|
|
|
- pr_info("EEH: Notify device drivers to shutdown\n");
|
|
|
- eeh_set_channel_state(pe, pci_channel_io_frozen);
|
|
|
- eeh_set_irq_state(pe, false);
|
|
|
- eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error,
|
|
|
- &result);
|
|
|
- if ((pe->type & EEH_PE_PHB) &&
|
|
|
- result != PCI_ERS_RESULT_NONE &&
|
|
|
- result != PCI_ERS_RESULT_NEED_RESET)
|
|
|
- result = PCI_ERS_RESULT_NEED_RESET;
|
|
|
+ if (result != PCI_ERS_RESULT_DISCONNECT) {
|
|
|
+ pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
|
|
|
+ pe->freeze_count, eeh_max_freezes);
|
|
|
+ pr_info("EEH: Notify device drivers to shutdown\n");
|
|
|
+ eeh_set_channel_state(pe, pci_channel_io_frozen);
|
|
|
+ eeh_set_irq_state(pe, false);
|
|
|
+ eeh_pe_report("error_detected(IO frozen)", pe,
|
|
|
+ eeh_report_error, &result);
|
|
|
+ if ((pe->type & EEH_PE_PHB) &&
|
|
|
+ result != PCI_ERS_RESULT_NONE &&
|
|
|
+ result != PCI_ERS_RESULT_NEED_RESET)
|
|
|
+ result = PCI_ERS_RESULT_NEED_RESET;
|
|
|
+ }
|
|
|
|
|
|
/* Get the current PCI slot state. This can take a long time,
|
|
|
* sometimes over 300 seconds for certain systems.
|
|
|
*/
|
|
|
- rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
|
|
|
- if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
|
|
|
- pr_warn("EEH: Permanent failure\n");
|
|
|
- goto hard_fail;
|
|
|
+ if (result != PCI_ERS_RESULT_DISCONNECT) {
|
|
|
+ rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
|
|
|
+ if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
|
|
|
+ pr_warn("EEH: Permanent failure\n");
|
|
|
+ result = PCI_ERS_RESULT_DISCONNECT;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/* Since rtas may enable MMIO when posting the error log,
|
|
|
* don't post the error log until after all dev drivers
|
|
|
* have been informed.
|
|
|
*/
|
|
|
- pr_info("EEH: Collect temporary log\n");
|
|
|
- eeh_slot_error_detail(pe, EEH_LOG_TEMP);
|
|
|
+ if (result != PCI_ERS_RESULT_DISCONNECT) {
|
|
|
+ pr_info("EEH: Collect temporary log\n");
|
|
|
+ eeh_slot_error_detail(pe, EEH_LOG_TEMP);
|
|
|
+ }
|
|
|
|
|
|
/* If all device drivers were EEH-unaware, then shut
|
|
|
* down all of the device drivers, and hope they
|
|
@@ -859,7 +865,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
|
|
|
if (rc) {
|
|
|
pr_warn("%s: Unable to reset, err=%d\n",
|
|
|
__func__, rc);
|
|
|
- goto hard_fail;
|
|
|
+ result = PCI_ERS_RESULT_DISCONNECT;
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -868,9 +874,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
|
|
|
pr_info("EEH: Enable I/O for affected devices\n");
|
|
|
rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
|
|
|
|
|
|
- if (rc < 0)
|
|
|
- goto hard_fail;
|
|
|
- if (rc) {
|
|
|
+ if (rc < 0) {
|
|
|
+ result = PCI_ERS_RESULT_DISCONNECT;
|
|
|
+ } else if (rc) {
|
|
|
result = PCI_ERS_RESULT_NEED_RESET;
|
|
|
} else {
|
|
|
pr_info("EEH: Notify device drivers to resume I/O\n");
|
|
@@ -884,9 +890,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
|
|
|
pr_info("EEH: Enabled DMA for affected devices\n");
|
|
|
rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
|
|
|
|
|
|
- if (rc < 0)
|
|
|
- goto hard_fail;
|
|
|
- if (rc) {
|
|
|
+ if (rc < 0) {
|
|
|
+ result = PCI_ERS_RESULT_DISCONNECT;
|
|
|
+ } else if (rc) {
|
|
|
result = PCI_ERS_RESULT_NEED_RESET;
|
|
|
} else {
|
|
|
/*
|
|
@@ -899,12 +905,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- /* If any device has a hard failure, then shut off everything. */
|
|
|
- if (result == PCI_ERS_RESULT_DISCONNECT) {
|
|
|
- pr_warn("EEH: Device driver gave up\n");
|
|
|
- goto hard_fail;
|
|
|
- }
|
|
|
-
|
|
|
/* If any device called out for a reset, then reset the slot */
|
|
|
if (result == PCI_ERS_RESULT_NEED_RESET) {
|
|
|
pr_info("EEH: Reset without hotplug activity\n");
|
|
@@ -912,89 +912,81 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
|
|
|
if (rc) {
|
|
|
pr_warn("%s: Cannot reset, err=%d\n",
|
|
|
__func__, rc);
|
|
|
- goto hard_fail;
|
|
|
+ result = PCI_ERS_RESULT_DISCONNECT;
|
|
|
+ } else {
|
|
|
+ result = PCI_ERS_RESULT_NONE;
|
|
|
+ eeh_set_channel_state(pe, pci_channel_io_normal);
|
|
|
+ eeh_set_irq_state(pe, true);
|
|
|
+ eeh_pe_report("slot_reset", pe, eeh_report_reset,
|
|
|
+ &result);
|
|
|
}
|
|
|
-
|
|
|
- pr_info("EEH: Notify device drivers "
|
|
|
- "the completion of reset\n");
|
|
|
- result = PCI_ERS_RESULT_NONE;
|
|
|
- eeh_set_channel_state(pe, pci_channel_io_normal);
|
|
|
- eeh_set_irq_state(pe, true);
|
|
|
- eeh_pe_report("slot_reset", pe, eeh_report_reset, &result);
|
|
|
}
|
|
|
|
|
|
- /* All devices should claim they have recovered by now. */
|
|
|
- if ((result != PCI_ERS_RESULT_RECOVERED) &&
|
|
|
- (result != PCI_ERS_RESULT_NONE)) {
|
|
|
- pr_warn("EEH: Not recovered\n");
|
|
|
- goto hard_fail;
|
|
|
- }
|
|
|
-
|
|
|
- /*
|
|
|
- * For those hot removed VFs, we should add back them after PF get
|
|
|
- * recovered properly.
|
|
|
- */
|
|
|
- list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
|
|
|
- rmv_entry) {
|
|
|
- eeh_add_virt_device(edev);
|
|
|
- list_del(&edev->rmv_entry);
|
|
|
- }
|
|
|
-
|
|
|
- /* Tell all device drivers that they can resume operations */
|
|
|
- pr_info("EEH: Notify device driver to resume\n");
|
|
|
- eeh_set_channel_state(pe, pci_channel_io_normal);
|
|
|
- eeh_set_irq_state(pe, true);
|
|
|
- eeh_pe_report("resume", pe, eeh_report_resume, NULL);
|
|
|
- eeh_for_each_pe(pe, tmp_pe) {
|
|
|
- eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
|
|
|
- edev->mode &= ~EEH_DEV_NO_HANDLER;
|
|
|
- edev->in_error = false;
|
|
|
+ if ((result == PCI_ERS_RESULT_RECOVERED) ||
|
|
|
+ (result == PCI_ERS_RESULT_NONE)) {
|
|
|
+ /*
|
|
|
+ * For those hot removed VFs, we should add back them after PF
|
|
|
+ * get recovered properly.
|
|
|
+ */
|
|
|
+ list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
|
|
|
+ rmv_entry) {
|
|
|
+ eeh_add_virt_device(edev);
|
|
|
+ list_del(&edev->rmv_entry);
|
|
|
}
|
|
|
- }
|
|
|
|
|
|
- pr_info("EEH: Recovery successful.\n");
|
|
|
- goto final;
|
|
|
+ /* Tell all device drivers that they can resume operations */
|
|
|
+ pr_info("EEH: Notify device driver to resume\n");
|
|
|
+ eeh_set_channel_state(pe, pci_channel_io_normal);
|
|
|
+ eeh_set_irq_state(pe, true);
|
|
|
+ eeh_pe_report("resume", pe, eeh_report_resume, NULL);
|
|
|
+ eeh_for_each_pe(pe, tmp_pe) {
|
|
|
+ eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
|
|
|
+ edev->mode &= ~EEH_DEV_NO_HANDLER;
|
|
|
+ edev->in_error = false;
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
-hard_fail:
|
|
|
- /*
|
|
|
- * About 90% of all real-life EEH failures in the field
|
|
|
- * are due to poorly seated PCI cards. Only 10% or so are
|
|
|
- * due to actual, failed cards.
|
|
|
- */
|
|
|
- pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
|
|
|
- "Please try reseating or replacing it\n",
|
|
|
- pe->phb->global_number, pe->addr);
|
|
|
+ pr_info("EEH: Recovery successful.\n");
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ * About 90% of all real-life EEH failures in the field
|
|
|
+ * are due to poorly seated PCI cards. Only 10% or so are
|
|
|
+ * due to actual, failed cards.
|
|
|
+ */
|
|
|
+ pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
|
|
|
+ "Please try reseating or replacing it\n",
|
|
|
+ pe->phb->global_number, pe->addr);
|
|
|
|
|
|
- eeh_slot_error_detail(pe, EEH_LOG_PERM);
|
|
|
+ eeh_slot_error_detail(pe, EEH_LOG_PERM);
|
|
|
|
|
|
- /* Notify all devices that they're about to go down. */
|
|
|
- eeh_set_channel_state(pe, pci_channel_io_perm_failure);
|
|
|
- eeh_set_irq_state(pe, false);
|
|
|
- eeh_pe_report("error_detected(permanent failure)", pe,
|
|
|
- eeh_report_failure, NULL);
|
|
|
+ /* Notify all devices that they're about to go down. */
|
|
|
+ eeh_set_channel_state(pe, pci_channel_io_perm_failure);
|
|
|
+ eeh_set_irq_state(pe, false);
|
|
|
+ eeh_pe_report("error_detected(permanent failure)", pe,
|
|
|
+ eeh_report_failure, NULL);
|
|
|
|
|
|
- /* Mark the PE to be removed permanently */
|
|
|
- eeh_pe_state_mark(pe, EEH_PE_REMOVED);
|
|
|
+ /* Mark the PE to be removed permanently */
|
|
|
+ eeh_pe_state_mark(pe, EEH_PE_REMOVED);
|
|
|
|
|
|
- /*
|
|
|
- * Shut down the device drivers for good. We mark
|
|
|
- * all removed devices correctly to avoid access
|
|
|
- * the their PCI config any more.
|
|
|
- */
|
|
|
- if (pe->type & EEH_PE_VF) {
|
|
|
- eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
|
|
|
- eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
|
|
|
- } else {
|
|
|
- eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
|
|
|
- eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
|
|
|
+ /*
|
|
|
+ * Shut down the device drivers for good. We mark
|
|
|
+ * all removed devices correctly to avoid access
|
|
|
+ * the their PCI config any more.
|
|
|
+ */
|
|
|
+ if (pe->type & EEH_PE_VF) {
|
|
|
+ eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
|
|
|
+ eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
|
|
|
+ } else {
|
|
|
+ eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
|
|
|
+ eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
|
|
|
|
|
|
- pci_lock_rescan_remove();
|
|
|
- pci_hp_remove_devices(bus);
|
|
|
- pci_unlock_rescan_remove();
|
|
|
- /* The passed PE should no longer be used */
|
|
|
- return;
|
|
|
+ pci_lock_rescan_remove();
|
|
|
+ pci_hp_remove_devices(bus);
|
|
|
+ pci_unlock_rescan_remove();
|
|
|
+ /* The passed PE should no longer be used */
|
|
|
+ return;
|
|
|
+ }
|
|
|
}
|
|
|
-final:
|
|
|
eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
|
|
|
}
|
|
|
|