|
@@ -61,10 +61,15 @@ enum {
|
|
|
enum {
|
|
|
MLX5_NIC_IFC_FULL = 0,
|
|
|
MLX5_NIC_IFC_DISABLED = 1,
|
|
|
- MLX5_NIC_IFC_NO_DRAM_NIC = 2
|
|
|
+ MLX5_NIC_IFC_NO_DRAM_NIC = 2, /* trailing comma added so another enumerator can follow */
|
|
|
+ MLX5_NIC_IFC_INVALID = 3 /* largest value the 2-bit field returned by get_nic_state() can hold; health_recover() aborts when it reads this */
|
|
|
};
|
|
|
|
|
|
-static u8 get_nic_interface(struct mlx5_core_dev *dev)
|
|
|
+enum { /* bit numbers used with set_bit()/test_bit()/clear_bit() on health->flags */
|
|
|
+ MLX5_DROP_NEW_HEALTH_WORK, /* while set, poll_health() and health_care() log an error instead of queuing new work */
|
|
|
+};
|
|
|
+
|
|
|
+static u8 get_nic_state(struct mlx5_core_dev *dev) /* Reads dev->iseg->cmdq_addr_l_sz with ioread32be() and returns bits 9:8 (0..3); callers compare the result against the MLX5_NIC_IFC_* values. */
|
|
|
{
|
|
|
return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
|
|
|
}
|
|
@@ -97,7 +102,7 @@ static int in_fatal(struct mlx5_core_dev *dev)
|
|
|
struct mlx5_core_health *health = &dev->priv.health;
|
|
|
struct health_buffer __iomem *h = health->health;
|
|
|
|
|
|
- if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED)
|
|
|
+ if (get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
|
|
|
return 1;
|
|
|
|
|
|
if (ioread32be(&h->fw_ver) == 0xffffffff)
|
|
@@ -127,7 +132,7 @@ unlock:
|
|
|
|
|
|
static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
|
|
|
{
|
|
|
- u8 nic_interface = get_nic_interface(dev);
|
|
|
+ u8 nic_interface = get_nic_state(dev);
|
|
|
|
|
|
switch (nic_interface) {
|
|
|
case MLX5_NIC_IFC_FULL:
|
|
@@ -149,8 +154,34 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
|
|
|
mlx5_disable_device(dev);
|
|
|
}
|
|
|
|
|
|
+static void health_recover(struct work_struct *work) /* Handler for health->recover_work (registered via INIT_DELAYED_WORK in mlx5_health_init()): re-reads the NIC state and, unless it is MLX5_NIC_IFC_INVALID, calls mlx5_recover_device(). @work is the work_struct embedded in that delayed work. */
|
|
|
+{
|
|
|
+ struct mlx5_core_health *health;
|
|
|
+ struct delayed_work *dwork;
|
|
|
+ struct mlx5_core_dev *dev;
|
|
|
+ struct mlx5_priv *priv;
|
|
|
+ u8 nic_state;
|
|
|
+
|
|
|
+ dwork = container_of(work, struct delayed_work, work); /* walk outwards: work -> delayed_work -> mlx5_core_health -> mlx5_priv -> mlx5_core_dev */
|
|
|
+ health = container_of(dwork, struct mlx5_core_health, recover_work);
|
|
|
+ priv = container_of(health, struct mlx5_priv, health);
|
|
|
+ dev = container_of(priv, struct mlx5_core_dev, priv);
|
|
|
+
|
|
|
+ nic_state = get_nic_state(dev); /* sampled when this handler runs, not when the work was scheduled */
|
|
|
+ if (nic_state == MLX5_NIC_IFC_INVALID) { /* log and return without attempting recovery */
|
|
|
+ dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ dev_err(&dev->pdev->dev, "starting health recovery flow\n"); /* logged at error level even though this is the normal recovery path */
|
|
|
+ mlx5_recover_device(dev);
|
|
|
+}
|
|
|
+
|
|
|
+/* Delay before the health recovery work resets the driver (in msecs) */
|
|
|
+#define MLX5_RECOVERY_DELAY_MSECS 60000
|
|
|
static void health_care(struct work_struct *work)
|
|
|
{
|
|
|
+ unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS);
|
|
|
struct mlx5_core_health *health;
|
|
|
struct mlx5_core_dev *dev;
|
|
|
struct mlx5_priv *priv;
|
|
@@ -160,6 +191,14 @@ static void health_care(struct work_struct *work)
|
|
|
dev = container_of(priv, struct mlx5_core_dev, priv);
|
|
|
mlx5_core_warn(dev, "handling bad device here\n");
|
|
|
mlx5_handle_bad_state(dev);
|
|
|
+
|
|
|
+ spin_lock(&health->wq_lock);
|
|
|
+ if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
|
|
|
+ schedule_delayed_work(&health->recover_work, recover_delay);
|
|
|
+ else
|
|
|
+ dev_err(&dev->pdev->dev,
|
|
|
+ "new health works are not permitted at this stage\n");
|
|
|
+ spin_unlock(&health->wq_lock);
|
|
|
}
|
|
|
|
|
|
static const char *hsynd_str(u8 synd)
|
|
@@ -272,7 +311,13 @@ static void poll_health(unsigned long data)
|
|
|
if (in_fatal(dev) && !health->sick) {
|
|
|
health->sick = true;
|
|
|
print_health_info(dev);
|
|
|
- schedule_work(&health->work);
|
|
|
+ spin_lock(&health->wq_lock);
|
|
|
+ if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
|
|
|
+ queue_work(health->wq, &health->work);
|
|
|
+ else
|
|
|
+ dev_err(&dev->pdev->dev,
|
|
|
+ "new health works are not permitted at this stage\n");
|
|
|
+ spin_unlock(&health->wq_lock);
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -281,6 +326,8 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
|
|
|
struct mlx5_core_health *health = &dev->priv.health;
|
|
|
|
|
|
init_timer(&health->timer);
|
|
|
+ health->sick = 0;
|
|
|
+ clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
|
|
|
health->health = &dev->iseg->health;
|
|
|
health->health_counter = &dev->iseg->health_counter;
|
|
|
|
|
@@ -297,11 +344,22 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
|
|
|
del_timer_sync(&health->timer);
|
|
|
}
|
|
|
|
|
|
+void mlx5_drain_health_wq(struct mlx5_core_dev *dev) /* Blocks new health work for @dev by setting MLX5_DROP_NEW_HEALTH_WORK, then cancels health->recover_work and health->work and waits for them to finish. May sleep (cancel_*_sync), so call from process context only. */
|
|
|
+{
|
|
|
+ struct mlx5_core_health *health = &dev->priv.health;
|
|
|
+
|
|
|
+ spin_lock_bh(&health->wq_lock); /* _bh: wq_lock is also taken in poll_health(), which appears to be the health timer callback; with bottom halves off it cannot run on this CPU and spin on a lock we already hold */
|
|
|
+ set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); /* set under the lock so poll_health()/health_care() either queue before this point or observe the flag */
|
|
|
+ spin_unlock_bh(&health->wq_lock);
|
|
|
+ cancel_delayed_work_sync(&health->recover_work); /* the flag is already set, so health_care() will not schedule recover_work again */
|
|
|
+ cancel_work_sync(&health->work);
|
|
|
+}
|
|
|
+
|
|
|
void mlx5_health_cleanup(struct mlx5_core_dev *dev)
|
|
|
{
|
|
|
struct mlx5_core_health *health = &dev->priv.health;
|
|
|
|
|
|
- flush_work(&health->work);
|
|
|
+ destroy_workqueue(health->wq); /* releases the workqueue created by create_singlethread_workqueue() in mlx5_health_init(); health->work is queued on it by poll_health() */
|
|
|
}
|
|
|
|
|
|
int mlx5_health_init(struct mlx5_core_dev *dev)
|
|
@@ -316,9 +374,13 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
|
|
|
|
|
|
strcpy(name, "mlx5_health");
|
|
|
strcat(name, dev_name(&dev->pdev->dev));
|
|
|
+ health->wq = create_singlethread_workqueue(name);
|
|
|
kfree(name);
|
|
|
-
|
|
|
+ if (!health->wq)
|
|
|
+ return -ENOMEM;
|
|
|
+ spin_lock_init(&health->wq_lock);
|
|
|
INIT_WORK(&health->work, health_care);
|
|
|
+ INIT_DELAYED_WORK(&health->recover_work, health_recover);
|
|
|
|
|
|
return 0;
|
|
|
}
|