@@ -98,7 +98,6 @@ struct nvme_dev {
 	unsigned long bar_mapped_size;
 	struct work_struct reset_work;
 	struct work_struct remove_work;
-	struct timer_list watchdog_timer;
 	struct mutex shutdown_lock;
 	bool subsystem;
 	void __iomem *cmb;
@@ -960,6 +959,51 @@ static void abort_endio(struct request *req, blk_status_t error)
 	blk_mq_free_request(req);
 }
 
+static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
+{
+
+	/* If true, indicates loss of adapter communication, possibly by a
+	 * NVMe Subsystem reset.
+	 */
+	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
+
+	/* If there is a reset ongoing, we shouldn't reset again. */
+	if (dev->ctrl.state == NVME_CTRL_RESETTING)
+		return false;
+
+	/* We shouldn't reset unless the controller is on fatal error state
+	 * _or_ if we lost the communication with it.
+	 */
+	if (!(csts & NVME_CSTS_CFS) && !nssro)
+		return false;
+
+	/* If PCI error recovery process is happening, we cannot reset or
+	 * the recovery mechanism will surely fail.
+	 */
+	if (pci_channel_offline(to_pci_dev(dev->dev)))
+		return false;
+
+	return true;
+}
+
+static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
+{
+	/* Read a config register to help see what died. */
+	u16 pci_status;
+	int result;
+
+	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
+				      &pci_status);
+	if (result == PCIBIOS_SUCCESSFUL)
+		dev_warn(dev->ctrl.device,
+			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
+			 csts, pci_status);
+	else
+		dev_warn(dev->ctrl.device,
+			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
+			 csts, result);
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -967,6 +1011,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	struct nvme_dev *dev = nvmeq->dev;
 	struct request *abort_req;
 	struct nvme_command cmd;
+	u32 csts = readl(dev->bar + NVME_REG_CSTS);
+
+	/*
+	 * Reset immediately if the controller is failed
+	 */
+	if (nvme_should_reset(dev, csts)) {
+		nvme_warn_reset(dev, csts);
+		nvme_dev_disable(dev, false);
+		nvme_reset(dev);
+		return BLK_EH_HANDLED;
+	}
 
 	/*
 	 * Did we miss an interrupt?
@@ -1398,66 +1453,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	return result;
 }
 
-static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
-{
-
-	/* If true, indicates loss of adapter communication, possibly by a
-	 * NVMe Subsystem reset.
-	 */
-	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
-
-	/* If there is a reset ongoing, we shouldn't reset again. */
-	if (dev->ctrl.state == NVME_CTRL_RESETTING)
-		return false;
-
-	/* We shouldn't reset unless the controller is on fatal error state
-	 * _or_ if we lost the communication with it.
-	 */
-	if (!(csts & NVME_CSTS_CFS) && !nssro)
-		return false;
-
-	/* If PCI error recovery process is happening, we cannot reset or
-	 * the recovery mechanism will surely fail.
-	 */
-	if (pci_channel_offline(to_pci_dev(dev->dev)))
-		return false;
-
-	return true;
-}
-
-static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
-{
-	/* Read a config register to help see what died. */
-	u16 pci_status;
-	int result;
-
-	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
-				      &pci_status);
-	if (result == PCIBIOS_SUCCESSFUL)
-		dev_warn(dev->ctrl.device,
-			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
-			 csts, pci_status);
-	else
-		dev_warn(dev->ctrl.device,
-			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
-			 csts, result);
-}
-
-static void nvme_watchdog_timer(unsigned long data)
-{
-	struct nvme_dev *dev = (struct nvme_dev *)data;
-	u32 csts = readl(dev->bar + NVME_REG_CSTS);
-
-	/* Skip controllers under certain specific conditions. */
-	if (nvme_should_reset(dev, csts)) {
-		if (!nvme_reset(dev))
-			nvme_warn_reset(dev, csts);
-		return;
-	}
-
-	mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
-}
-
 static int nvme_create_io_queues(struct nvme_dev *dev)
 {
 	unsigned i, max;
@@ -1986,8 +1981,6 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	bool dead = true;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
-	del_timer_sync(&dev->watchdog_timer);
-
 	mutex_lock(&dev->shutdown_lock);
 	if (pci_is_enabled(pdev)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
@@ -2163,8 +2156,6 @@ static void nvme_reset_work(struct work_struct *work)
 	if (dev->online_queues > 1)
 		nvme_queue_async_events(&dev->ctrl);
 
-	mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
-
 	/*
 	 * Keep the controller around but remove all namespaces if we don't have
 	 * any working I/O queue.
@@ -2318,8 +2309,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	INIT_WORK(&dev->reset_work, nvme_reset_work);
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
-	setup_timer(&dev->watchdog_timer, nvme_watchdog_timer,
-		    (unsigned long)dev);
 	mutex_init(&dev->shutdown_lock);
 	init_completion(&dev->ioq_wait);
 
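
For review context only (not part of the patch): below is a minimal user-space sketch of the decision order that nvme_should_reset() applies, now invoked from the timeout handler instead of the removed one-second watchdog poll. The names (fake_dev, should_reset) and the stubbed CSTS bit values are illustrative assumptions, not kernel API.

/* Standalone sketch of the "should we reset?" ordering used above:
 * an in-flight reset wins, then the controller must look dead
 * (fatal status or lost link), and PCI error recovery must not be
 * active. Kernel types and helpers are replaced with plain fields. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed CSTS bit positions for illustration only. */
#define FAKE_CSTS_CFS	(1u << 1)	/* Controller Fatal Status */
#define FAKE_CSTS_NSSRO	(1u << 4)	/* NVM Subsystem Reset Occurred */

struct fake_dev {
	bool subsystem;		/* controller supports subsystem reset */
	bool resetting;		/* stand-in for ctrl.state == NVME_CTRL_RESETTING */
	bool pci_offline;	/* stand-in for pci_channel_offline() */
};

static bool should_reset(const struct fake_dev *dev, uint32_t csts)
{
	bool nssro = dev->subsystem && (csts & FAKE_CSTS_NSSRO);

	if (dev->resetting)			/* a reset is already in flight */
		return false;
	if (!(csts & FAKE_CSTS_CFS) && !nssro)	/* neither fatal nor lost link */
		return false;
	if (dev->pci_offline)			/* PCI error recovery owns the device */
		return false;
	return true;
}

int main(void)
{
	struct fake_dev dev = { .subsystem = true };

	/* Fatal status set: the timeout handler would disable and reset. */
	printf("CFS set -> %d\n", should_reset(&dev, FAKE_CSTS_CFS));
	/* Healthy controller: fall through to the normal abort path. */
	printf("healthy -> %d\n", should_reset(&dev, 0));
	return 0;
}

The practical effect of the patch is that this check runs at the first timed-out command, so a failed controller is reset immediately rather than on the next tick of the periodic watchdog that the patch removes.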