git.baikalelectronics.ru Git - kernel.git/commitdiff
accel/habanalabs: add pci health check during heartbeat
author Ofir Bitton <obitton@habana.ai>
Tue, 18 Apr 2023 11:48:22 +0000 (14:48 +0300)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 23 Aug 2023 15:52:21 +0000 (17:52 +0200)
[ Upstream commit d8b9cea584661b30305cf341bf9f675dc0a25471 ]

Currently upon a heartbeat failure, we don't know if the failure
is due to firmware hang or due to a bad PCI link. Hence, we
are reading a PCI config space register with a known value (vendor ID)
so we will know which of the two possibilities caused the heartbeat
failure.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/misc/habanalabs/common/device.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/common/habanalabs_drv.c

index e0dca445abf14bbdfc622baadaad1365f8192ad9..9ee1b6abd8a051ed33d6f8c83df4642ae4237c28 100644 (file)
@@ -870,6 +870,18 @@ static void device_early_fini(struct hl_device *hdev)
                hdev->asic_funcs->early_fini(hdev);
 }
 
+static bool is_pci_link_healthy(struct hl_device *hdev)
+{
+       u16 vendor_id;
+
+       if (!hdev->pdev)
+               return false;
+
+       pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
+
+       return (vendor_id == PCI_VENDOR_ID_HABANALABS);
+}
+
 static void hl_device_heartbeat(struct work_struct *work)
 {
        struct hl_device *hdev = container_of(work, struct hl_device,
@@ -882,7 +894,8 @@ static void hl_device_heartbeat(struct work_struct *work)
                goto reschedule;
 
        if (hl_device_operational(hdev, NULL))
-               dev_err(hdev->dev, "Device heartbeat failed!\n");
+               dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
+                       is_pci_link_healthy(hdev) ? "healthy" : "broken");
 
        hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT);
 
index 58c95b13be69a4706ab9f75144ca6be7d55fa55d..257b94cec6248bceb392fe99c41307f9b62ac40a 100644 (file)
@@ -34,6 +34,8 @@
 struct hl_device;
 struct hl_fpriv;
 
+#define PCI_VENDOR_ID_HABANALABS       0x1da3
+
 /* Use upper bits of mmap offset to store habana driver specific information.
  * bits[63:59] - Encode mmap type
  * bits[45:0]  - mmap offset value
index 112632afe7d53806be61dd058fa0aba86647e7ba..ae3cab3f4aa55a7b4fda2c8b070b502afbea72ae 100644 (file)
@@ -54,8 +54,6 @@ module_param(boot_error_status_mask, ulong, 0444);
 MODULE_PARM_DESC(boot_error_status_mask,
        "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
 
-#define PCI_VENDOR_ID_HABANALABS       0x1da3
-
 #define PCI_IDS_GOYA                   0x0001
 #define PCI_IDS_GAUDI                  0x1000
 #define PCI_IDS_GAUDI_SEC              0x1010