|
@@ -39,12 +39,14 @@
|
|
|
#include <linux/slab.h>
|
|
|
#include <linux/io-mapping.h>
|
|
|
#include <linux/interrupt.h>
|
|
|
+#include <linux/delay.h>
|
|
|
#include <linux/mlx5/driver.h>
|
|
|
#include <linux/mlx5/cq.h>
|
|
|
#include <linux/mlx5/qp.h>
|
|
|
#include <linux/mlx5/srq.h>
|
|
|
#include <linux/debugfs.h>
|
|
|
#include <linux/kmod.h>
|
|
|
+#include <linux/delay.h>
|
|
|
#include <linux/mlx5/mlx5_ifc.h>
|
|
|
#include "mlx5_core.h"
|
|
|
|
|
@@ -151,6 +153,25 @@ static struct mlx5_profile profile[] = {
|
|
|
},
|
|
|
};
|
|
|
|
|
|
+#define FW_INIT_TIMEOUT_MILI 2000
|
|
|
+#define FW_INIT_WAIT_MS 2
|
|
|
+
|
|
|
+static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili)
|
|
|
+{
|
|
|
+ unsigned long end = jiffies + msecs_to_jiffies(max_wait_mili);
|
|
|
+ int err = 0;
|
|
|
+
|
|
|
+ while (fw_initializing(dev)) {
|
|
|
+ if (time_after(jiffies, end)) {
|
|
|
+ err = -EBUSY;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ msleep(FW_INIT_WAIT_MS);
|
|
|
+ }
|
|
|
+
|
|
|
+ return err;
|
|
|
+}
|
|
|
+
|
|
|
static int set_dma_caps(struct pci_dev *pdev)
|
|
|
{
|
|
|
int err;
|
|
@@ -181,6 +202,34 @@ static int set_dma_caps(struct pci_dev *pdev)
|
|
|
return err;
|
|
|
}
|
|
|
|
|
|
+static int mlx5_pci_enable_device(struct mlx5_core_dev *dev)
|
|
|
+{
|
|
|
+ struct pci_dev *pdev = dev->pdev;
|
|
|
+ int err = 0;
|
|
|
+
|
|
|
+ mutex_lock(&dev->pci_status_mutex);
|
|
|
+ if (dev->pci_status == MLX5_PCI_STATUS_DISABLED) {
|
|
|
+ err = pci_enable_device(pdev);
|
|
|
+ if (!err)
|
|
|
+ dev->pci_status = MLX5_PCI_STATUS_ENABLED;
|
|
|
+ }
|
|
|
+ mutex_unlock(&dev->pci_status_mutex);
|
|
|
+
|
|
|
+ return err;
|
|
|
+}
|
|
|
+
|
|
|
+static void mlx5_pci_disable_device(struct mlx5_core_dev *dev)
|
|
|
+{
|
|
|
+ struct pci_dev *pdev = dev->pdev;
|
|
|
+
|
|
|
+ mutex_lock(&dev->pci_status_mutex);
|
|
|
+ if (dev->pci_status == MLX5_PCI_STATUS_ENABLED) {
|
|
|
+ pci_disable_device(pdev);
|
|
|
+ dev->pci_status = MLX5_PCI_STATUS_DISABLED;
|
|
|
+ }
|
|
|
+ mutex_unlock(&dev->pci_status_mutex);
|
|
|
+}
|
|
|
+
|
|
|
static int request_bar(struct pci_dev *pdev)
|
|
|
{
|
|
|
int err = 0;
|
|
@@ -807,7 +856,7 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
|
|
|
if (!priv->dbg_root)
|
|
|
return -ENOMEM;
|
|
|
|
|
|
- err = pci_enable_device(pdev);
|
|
|
+ err = mlx5_pci_enable_device(dev);
|
|
|
if (err) {
|
|
|
dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n");
|
|
|
goto err_dbg;
|
|
@@ -841,7 +890,7 @@ err_clr_master:
|
|
|
pci_clear_master(dev->pdev);
|
|
|
release_bar(dev->pdev);
|
|
|
err_disable:
|
|
|
- pci_disable_device(dev->pdev);
|
|
|
+ mlx5_pci_disable_device(dev);
|
|
|
|
|
|
err_dbg:
|
|
|
debugfs_remove(priv->dbg_root);
|
|
@@ -853,7 +902,7 @@ static void mlx5_pci_close(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
|
|
|
iounmap(dev->iseg);
|
|
|
pci_clear_master(dev->pdev);
|
|
|
release_bar(dev->pdev);
|
|
|
- pci_disable_device(dev->pdev);
|
|
|
+ mlx5_pci_disable_device(dev);
|
|
|
debugfs_remove(priv->dbg_root);
|
|
|
}
|
|
|
|
|
@@ -863,13 +912,32 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
|
|
|
struct pci_dev *pdev = dev->pdev;
|
|
|
int err;
|
|
|
|
|
|
+ mutex_lock(&dev->intf_state_mutex);
|
|
|
+ if (dev->interface_state == MLX5_INTERFACE_STATE_UP) {
|
|
|
+ dev_warn(&dev->pdev->dev, "%s: interface is up, NOP\n",
|
|
|
+ __func__);
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+
|
|
|
dev_info(&pdev->dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev),
|
|
|
fw_rev_min(dev), fw_rev_sub(dev));
|
|
|
|
|
|
+ /* on load removing any previous indication of internal error, device is
|
|
|
+ * up
|
|
|
+ */
|
|
|
+ dev->state = MLX5_DEVICE_STATE_UP;
|
|
|
+
|
|
|
err = mlx5_cmd_init(dev);
|
|
|
if (err) {
|
|
|
dev_err(&pdev->dev, "Failed initializing command interface, aborting\n");
|
|
|
- return err;
|
|
|
+ goto out_err;
|
|
|
+ }
|
|
|
+
|
|
|
+ err = wait_fw_init(dev, FW_INIT_TIMEOUT_MILI);
|
|
|
+ if (err) {
|
|
|
+ dev_err(&dev->pdev->dev, "Firmware over %d MS in initializing state, aborting\n",
|
|
|
+ FW_INIT_TIMEOUT_MILI);
|
|
|
+ goto out_err;
|
|
|
}
|
|
|
|
|
|
mlx5_pagealloc_init(dev);
|
|
@@ -994,6 +1062,10 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
|
|
|
if (err)
|
|
|
pr_info("failed request module on %s\n", MLX5_IB_MOD);
|
|
|
|
|
|
+ dev->interface_state = MLX5_INTERFACE_STATE_UP;
|
|
|
+out:
|
|
|
+ mutex_unlock(&dev->intf_state_mutex);
|
|
|
+
|
|
|
return 0;
|
|
|
|
|
|
err_reg_dev:
|
|
@@ -1024,7 +1096,7 @@ err_stop_poll:
|
|
|
mlx5_stop_health_poll(dev);
|
|
|
if (mlx5_cmd_teardown_hca(dev)) {
|
|
|
dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
|
|
|
- return err;
|
|
|
+ goto out_err;
|
|
|
}
|
|
|
|
|
|
err_pagealloc_stop:
|
|
@@ -1040,13 +1112,23 @@ err_pagealloc_cleanup:
|
|
|
mlx5_pagealloc_cleanup(dev);
|
|
|
mlx5_cmd_cleanup(dev);
|
|
|
|
|
|
+out_err:
|
|
|
+ dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
|
|
|
+ mutex_unlock(&dev->intf_state_mutex);
|
|
|
+
|
|
|
return err;
|
|
|
}
|
|
|
|
|
|
static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
|
|
|
{
|
|
|
- int err;
|
|
|
+ int err = 0;
|
|
|
|
|
|
+ mutex_lock(&dev->intf_state_mutex);
|
|
|
+ if (dev->interface_state == MLX5_INTERFACE_STATE_DOWN) {
|
|
|
+ dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n",
|
|
|
+ __func__);
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
mlx5_unregister_device(dev);
|
|
|
mlx5_cleanup_mr_table(dev);
|
|
|
mlx5_cleanup_srq_table(dev);
|
|
@@ -1072,10 +1154,12 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
|
|
|
mlx5_cmd_cleanup(dev);
|
|
|
|
|
|
out:
|
|
|
+ dev->interface_state = MLX5_INTERFACE_STATE_DOWN;
|
|
|
+ mutex_unlock(&dev->intf_state_mutex);
|
|
|
return err;
|
|
|
}
|
|
|
|
|
|
-static void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
|
|
|
+void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
|
|
|
unsigned long param)
|
|
|
{
|
|
|
struct mlx5_priv *priv = &dev->priv;
|
|
@@ -1125,6 +1209,8 @@ static int init_one(struct pci_dev *pdev,
|
|
|
|
|
|
INIT_LIST_HEAD(&priv->ctx_list);
|
|
|
spin_lock_init(&priv->ctx_lock);
|
|
|
+ mutex_init(&dev->pci_status_mutex);
|
|
|
+ mutex_init(&dev->intf_state_mutex);
|
|
|
err = mlx5_pci_init(dev, priv);
|
|
|
if (err) {
|
|
|
dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err);
|
|
@@ -1172,6 +1258,112 @@ static void remove_one(struct pci_dev *pdev)
|
|
|
kfree(dev);
|
|
|
}
|
|
|
|
|
|
+static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
|
|
|
+ pci_channel_state_t state)
|
|
|
+{
|
|
|
+ struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
|
|
|
+ struct mlx5_priv *priv = &dev->priv;
|
|
|
+
|
|
|
+ dev_info(&pdev->dev, "%s was called\n", __func__);
|
|
|
+ mlx5_enter_error_state(dev);
|
|
|
+ mlx5_unload_one(dev, priv);
|
|
|
+ mlx5_pci_disable_device(dev);
|
|
|
+ return state == pci_channel_io_perm_failure ?
|
|
|
+ PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
|
|
|
+}
|
|
|
+
|
|
|
+static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev)
|
|
|
+{
|
|
|
+ struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
|
|
|
+ int err = 0;
|
|
|
+
|
|
|
+ dev_info(&pdev->dev, "%s was called\n", __func__);
|
|
|
+
|
|
|
+ err = mlx5_pci_enable_device(dev);
|
|
|
+ if (err) {
|
|
|
+ dev_err(&pdev->dev, "%s: mlx5_pci_enable_device failed with error code: %d\n"
|
|
|
+ , __func__, err);
|
|
|
+ return PCI_ERS_RESULT_DISCONNECT;
|
|
|
+ }
|
|
|
+ pci_set_master(pdev);
|
|
|
+ pci_set_power_state(pdev, PCI_D0);
|
|
|
+ pci_restore_state(pdev);
|
|
|
+
|
|
|
+ return err ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
|
|
|
+}
|
|
|
+
|
|
|
+void mlx5_disable_device(struct mlx5_core_dev *dev)
|
|
|
+{
|
|
|
+ mlx5_pci_err_detected(dev->pdev, 0);
|
|
|
+}
|
|
|
+
|
|
|
+/* wait for the device to show vital signs. For now we check
|
|
|
+ * that we can read the device ID and that the health buffer
|
|
|
+ * shows a non zero value which is different than 0xffffffff
|
|
|
+ */
|
|
|
+static void wait_vital(struct pci_dev *pdev)
|
|
|
+{
|
|
|
+ struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
|
|
|
+ struct mlx5_core_health *health = &dev->priv.health;
|
|
|
+ const int niter = 100;
|
|
|
+ u32 count;
|
|
|
+ u16 did;
|
|
|
+ int i;
|
|
|
+
|
|
|
+ /* Wait for firmware to be ready after reset */
|
|
|
+ msleep(1000);
|
|
|
+ for (i = 0; i < niter; i++) {
|
|
|
+ if (pci_read_config_word(pdev, 2, &did)) {
|
|
|
+ dev_warn(&pdev->dev, "failed reading config word\n");
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ if (did == pdev->device) {
|
|
|
+ dev_info(&pdev->dev, "device ID correctly read after %d iterations\n", i);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ msleep(50);
|
|
|
+ }
|
|
|
+ if (i == niter)
|
|
|
+ dev_warn(&pdev->dev, "%s-%d: could not read device ID\n", __func__, __LINE__);
|
|
|
+
|
|
|
+ for (i = 0; i < niter; i++) {
|
|
|
+ count = ioread32be(health->health_counter);
|
|
|
+ if (count && count != 0xffffffff) {
|
|
|
+ dev_info(&pdev->dev, "Counter value 0x%x after %d iterations\n", count, i);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ msleep(50);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (i == niter)
|
|
|
+ dev_warn(&pdev->dev, "%s-%d: could not read device ID\n", __func__, __LINE__);
|
|
|
+}
|
|
|
+
|
|
|
+static void mlx5_pci_resume(struct pci_dev *pdev)
|
|
|
+{
|
|
|
+ struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
|
|
|
+ struct mlx5_priv *priv = &dev->priv;
|
|
|
+ int err;
|
|
|
+
|
|
|
+ dev_info(&pdev->dev, "%s was called\n", __func__);
|
|
|
+
|
|
|
+ pci_save_state(pdev);
|
|
|
+ wait_vital(pdev);
|
|
|
+
|
|
|
+ err = mlx5_load_one(dev, priv);
|
|
|
+ if (err)
|
|
|
+ dev_err(&pdev->dev, "%s: mlx5_load_one failed with error code: %d\n"
|
|
|
+ , __func__, err);
|
|
|
+ else
|
|
|
+ dev_info(&pdev->dev, "%s: device recovered\n", __func__);
|
|
|
+}
|
|
|
+
|
|
|
+static const struct pci_error_handlers mlx5_err_handler = {
|
|
|
+ .error_detected = mlx5_pci_err_detected,
|
|
|
+ .slot_reset = mlx5_pci_slot_reset,
|
|
|
+ .resume = mlx5_pci_resume
|
|
|
+};
|
|
|
+
|
|
|
static const struct pci_device_id mlx5_core_pci_table[] = {
|
|
|
{ PCI_VDEVICE(MELLANOX, 0x1011) }, /* Connect-IB */
|
|
|
{ PCI_VDEVICE(MELLANOX, 0x1012) }, /* Connect-IB VF */
|
|
@@ -1188,7 +1380,8 @@ static struct pci_driver mlx5_core_driver = {
|
|
|
.name = DRIVER_NAME,
|
|
|
.id_table = mlx5_core_pci_table,
|
|
|
.probe = init_one,
|
|
|
- .remove = remove_one
|
|
|
+ .remove = remove_one,
|
|
|
+ .err_handler = &mlx5_err_handler
|
|
|
};
|
|
|
|
|
|
static int __init init(void)
|