git.baikalelectronics.ru Git - kernel.git/commitdiff
net/mlx5: Check returned value from health recover sequence
author Leon Romanovsky <leonro@nvidia.com>
Tue, 3 Nov 2020 16:46:31 +0000 (18:46 +0200)
committer Saeed Mahameed <saeedm@nvidia.com>
Thu, 11 Mar 2021 22:35:12 +0000 (14:35 -0800)
MLX5_INTERFACE_STATE_UP is far from being a reliable check for a
successful recovery, because it can change at any time and the health
logic doesn't hold any locks to protect against that.

Locks are not needed here because health recovery is nice to have but
is not required to succeed, so rely on the value returned from
mlx5_recover_device() as the marker for success or failure.

Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
drivers/net/ethernet/mellanox/mlx5/core/health.c
drivers/net/ethernet/mellanox/mlx5/core/main.c
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h

index 0c32c485eb5887b82a2f71b1685fdba0f1552c56..a0a85164080436a0d49e4eaef83fcc6d643fc8dc 100644 (file)
@@ -335,12 +335,12 @@ static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
                return -EIO;
        }
        mlx5_core_err(dev, "starting health recovery flow\n");
-       mlx5_recover_device(dev);
-       if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) ||
-           mlx5_health_check_fatal_sensors(dev)) {
+       if (mlx5_recover_device(dev) || mlx5_health_check_fatal_sensors(dev)) {
                mlx5_core_err(dev, "health recovery failed\n");
                return -EIO;
        }
+
+       mlx5_core_info(dev, "health revovery succeded\n");
        return 0;
 }
 
index 363bc3e917c20d3a9938261682a31f304bcf24f5..e3a417d177070fdc2507ae97f1a50064cba89f7b 100644 (file)
@@ -1721,11 +1721,14 @@ void mlx5_disable_device(struct mlx5_core_dev *dev)
        mlx5_unload_one(dev);
 }
 
-void mlx5_recover_device(struct mlx5_core_dev *dev)
+int mlx5_recover_device(struct mlx5_core_dev *dev)
 {
+       int ret = -EIO;
+
        mlx5_pci_disable_device(dev);
        if (mlx5_pci_slot_reset(dev->pdev) == PCI_ERS_RESULT_RECOVERED)
-               mlx5_pci_resume(dev->pdev);
+               ret = mlx5_load_one(dev);
+       return ret;
 }
 
 static struct pci_driver mlx5_core_driver = {
index 02993a51b114d6cc081325a3e13a7c2a2cd6dda6..37c8ec7d221718b827e5d6dc56fd39c582672f8c 100644 (file)
@@ -134,7 +134,7 @@ void mlx5_error_sw_reset(struct mlx5_core_dev *dev);
 u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev);
 int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev);
 void mlx5_disable_device(struct mlx5_core_dev *dev);
-void mlx5_recover_device(struct mlx5_core_dev *dev);
+int mlx5_recover_device(struct mlx5_core_dev *dev);
 int mlx5_sriov_init(struct mlx5_core_dev *dev);
 void mlx5_sriov_cleanup(struct mlx5_core_dev *dev);
 int mlx5_sriov_attach(struct mlx5_core_dev *dev);