git.baikalelectronics.ru Git - kernel.git/commitdiff
habanalabs: GAUDI does not support soft-reset
author Oded Gabbay <oded.gabbay@gmail.com>
Mon, 18 May 2020 13:48:01 +0000 (16:48 +0300)
committer Oded Gabbay <oded.gabbay@gmail.com>
Mon, 25 May 2020 05:15:33 +0000 (08:15 +0300)
GAUDI does not support soft-reset as it leaves the NIC ports in an awkward
state, where their QMANs were reset but the NIC itself is still working.

In addition, there is not much sense in doing soft-reset when training is
done on multiple GAUDIs.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
drivers/misc/habanalabs/device.c
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/goya/goya.c
drivers/misc/habanalabs/habanalabs.h
drivers/misc/habanalabs/sysfs.c

index 4b6c8de46dd8645158d808d88614de675bd55a15..4a4a446f479e883f1ccd662470f5aa79a61ca469 100644 (file)
@@ -801,6 +801,7 @@ static void device_hard_reset_pending(struct work_struct *work)
  * @hdev: pointer to habanalabs device structure
  * @hard_reset: should we do hard reset to all engines or just reset the
  *              compute/dma engines
+ * @from_hard_reset_thread: is the caller the hard-reset thread
  *
  * Block future CS and wait for pending CS to be enqueued
  * Call ASIC H/W fini
@@ -823,6 +824,11 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
                return 0;
        }
 
+       if ((!hard_reset) && (!hdev->supports_soft_reset)) {
+               dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
+               hard_reset = true;
+       }
+
        /*
         * Prevent concurrency in this function - only one reset should be
         * done at any given time. Only need to perform this if we didn't
index 3d4a569914d3ce38ae272f9c8839bbf2762efce6..92a5130f06fbb8e3980bf993cfdac4a998a016e3 100644 (file)
@@ -5774,7 +5774,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
        u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
                        >> EQ_CTL_EVENT_TYPE_SHIFT);
        u8 cause;
-       bool soft_reset_required;
+       bool reset_required;
 
        gaudi->events_stat[event_type]++;
        gaudi->events_stat_aggregate[event_type]++;
@@ -5840,16 +5840,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
        case GAUDI_EVENT_TPC6_DEC:
        case GAUDI_EVENT_TPC7_DEC:
                gaudi_print_irq_info(hdev, event_type, true);
-               soft_reset_required = gaudi_tpc_read_interrupts(hdev,
+               reset_required = gaudi_tpc_read_interrupts(hdev,
                                        tpc_dec_event_to_tpc_id(event_type),
                                        "AXI_SLV_DEC_Error");
-               if (soft_reset_required) {
-                       dev_err_ratelimited(hdev->dev,
-                                       "soft reset required due to %s\n",
-                                       gaudi_irq_map_table[event_type].name);
-                       hl_device_reset(hdev, false, false);
+               if (reset_required) {
+                       dev_err(hdev->dev, "hard reset required due to %s\n",
+                               gaudi_irq_map_table[event_type].name);
+
+                       if (hdev->hard_reset_on_fw_events)
+                               hl_device_reset(hdev, true, false);
+               } else {
+                       hl_fw_unmask_irq(hdev, event_type);
                }
-               hl_fw_unmask_irq(hdev, event_type);
                break;
 
        case GAUDI_EVENT_TPC0_KRN_ERR:
@@ -5861,16 +5863,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
        case GAUDI_EVENT_TPC6_KRN_ERR:
        case GAUDI_EVENT_TPC7_KRN_ERR:
                gaudi_print_irq_info(hdev, event_type, true);
-               soft_reset_required = gaudi_tpc_read_interrupts(hdev,
+               reset_required = gaudi_tpc_read_interrupts(hdev,
                                        tpc_krn_event_to_tpc_id(event_type),
                                        "KRN_ERR");
-               if (soft_reset_required) {
-                       dev_err_ratelimited(hdev->dev,
-                                       "soft reset required due to %s\n",
-                                       gaudi_irq_map_table[event_type].name);
-                       hl_device_reset(hdev, false, false);
+               if (reset_required) {
+                       dev_err(hdev->dev, "hard reset required due to %s\n",
+                               gaudi_irq_map_table[event_type].name);
+
+                       if (hdev->hard_reset_on_fw_events)
+                               hl_device_reset(hdev, true, false);
+               } else {
+                       hl_fw_unmask_irq(hdev, event_type);
                }
-               hl_fw_unmask_irq(hdev, event_type);
                break;
 
        case GAUDI_EVENT_PCIE_CORE_SERR:
@@ -5921,8 +5925,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 
        case GAUDI_EVENT_RAZWI_OR_ADC_SW:
                gaudi_print_irq_info(hdev, event_type, true);
-               hl_device_reset(hdev, false, false);
-               hl_fw_unmask_irq(hdev, event_type);
+               if (hdev->hard_reset_on_fw_events)
+                       hl_device_reset(hdev, true, false);
                break;
 
        case GAUDI_EVENT_TPC0_BMON_SPMU:
index 15b6c3228e37ce91588f0adf9d627ac7b17acf49..152418dfe20c3dd71e6e3d649eab6cbcb68e1ba6 100644 (file)
@@ -752,6 +752,7 @@ static int goya_sw_init(struct hl_device *hdev)
 
        spin_lock_init(&goya->hw_queues_lock);
        hdev->supports_coresight = true;
+       hdev->supports_soft_reset = true;
 
        return 0;
 
index 5a855b7edf4302a3b0e034fb9afa3422f75b45f2..0f0691875298d734061c4e1483779c000bb7c3fd 100644 (file)
@@ -1436,6 +1436,7 @@ struct hl_device_idle_busy_ts {
  * @stop_on_err: true if engines should stop on error.
  * @supports_sync_stream: is sync stream supported.
  * @supports_coresight: is CoreSight supported.
+ * @supports_soft_reset: is soft reset supported.
  */
 struct hl_device {
        struct pci_dev                  *pdev;
@@ -1522,6 +1523,7 @@ struct hl_device {
        u8                              stop_on_err;
        u8                              supports_sync_stream;
        u8                              supports_coresight;
+       u8                              supports_soft_reset;
 
        /* Parameters for bring-up */
        u8                              mmu_enable;
index e4454414d0e10d00b754fdeda2f8eb81cbd4521d..5d78d5e1c7826163601cacdd12bac6a1a87e4e3e 100644 (file)
@@ -183,6 +183,11 @@ static ssize_t soft_reset_store(struct device *dev,
                goto out;
        }
 
+       if (!hdev->supports_soft_reset) {
+               dev_err(hdev->dev, "Device does not support soft-reset\n");
+               goto out;
+       }
+
        dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n");
 
        hl_device_reset(hdev, false, false);