scsi: qla2xxx: Wind down adapter after PCIe error

author Quinn Tran <qutran@marvell.com>

Thu, 16 Jun 2022 05:35:00 +0000 (22:35 -0700)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Wed, 17 Aug 2022 12:24:17 +0000 (14:24 +0200)
author Quinn Tran <qutran@marvell.com>
Thu, 16 Jun 2022 05:35:00 +0000 (22:35 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 17 Aug 2022 12:24:17 +0000 (14:24 +0200)
diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c

index c636165be52be03a0aaa1f6a0e391e8ae1972a3d..3650f16cab6cf51cc5010a1494ec70ebfba166df 100644 (file)
--- a/drivers/scsi/qla2xxx/qla_bsg.c
+++ b/drivers/scsi/qla2xxx/qla_bsg.c
@@ -2972,6 +2972,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
  
         ql_log(ql_log_info, vha, 0x708b, "%s CMD timeout. bsg ptr %p.\n",
             __func__, bsg_job);
+
+       if (qla2x00_isp_reg_stat(ha)) {
+               ql_log(ql_log_info, vha, 0x9007,
+                   "PCI/Register disconnect.\n");
+               qla_pci_set_eeh_busy(vha);
+       }
+
         /* find the bsg job from the active list of commands */
         spin_lock_irqsave(&ha->hardware_lock, flags);
         for (que = 0; que < ha->max_req_queues; que++) {
@@ -2989,7 +2996,8 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
                             sp->u.bsg_job == bsg_job) {
                                 req->outstanding_cmds[cnt] = NULL;
                                 spin_unlock_irqrestore(&ha->hardware_lock, flags);
-                               if (ha->isp_ops->abort_command(sp)) {
+
+                               if (!ha->flags.eeh_busy && ha->isp_ops->abort_command(sp)) {
                                         ql_log(ql_log_warn, vha, 0x7089,
                                             "mbx abort_command failed.\n");
                                         bsg_reply->result = -EIO;
diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h

index 15eeefc5e0a58a3e08229babfff0805f959c530c..51c7ce5f9792313153e7d269909b6805776596fd 100644 (file)
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -4045,6 +4045,9 @@ struct qla_hw_data {
                 uint32_t        n2n_fw_acc_sec:1;
                 uint32_t        plogi_template_valid:1;
                 uint32_t        port_isolated:1;
+               uint32_t        eeh_flush:2;
+#define EEH_FLUSH_RDY  1
+#define EEH_FLUSH_DONE 2
         } flags;
  
         uint16_t max_exchg;
@@ -4079,6 +4082,7 @@ struct qla_hw_data {
         uint32_t                rsp_que_len;
         uint32_t                req_que_off;
         uint32_t                rsp_que_off;
+       unsigned long           eeh_jif;
  
         /* Multi queue data structs */
         device_reg_t *mqiobase;
diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c

index de217037aaa74a874a9b0839a9c7d8069a6426d1..6ffae5565b9e190a849e689a4e6582b356db3819 100644 (file)
--- a/drivers/scsi/qla2xxx/qla_init.c
+++ b/drivers/scsi/qla2xxx/qla_init.c
@@ -47,6 +47,7 @@ qla2x00_sp_timeout(struct timer_list *t)
  {
         srb_t *sp = from_timer(sp, t, u.iocb_cmd.timer);
         struct srb_iocb *iocb;
+       scsi_qla_host_t *vha = sp->vha;
  
         WARN_ON(irqs_disabled());
         iocb = &sp->u.iocb_cmd;
@@ -54,6 +55,12 @@ qla2x00_sp_timeout(struct timer_list *t)
  
         /* ref: TMR */
         kref_put(&sp->cmd_kref, qla2x00_sp_release);
+
+       if (vha && qla2x00_isp_reg_stat(vha->hw)) {
+               ql_log(ql_log_info, vha, 0x9008,
+                   "PCI/Register disconnect.\n");
+               qla_pci_set_eeh_busy(vha);
+       }
  }
  
  void qla2x00_sp_free(srb_t *sp)
@@ -9731,6 +9738,12 @@ int qla2xxx_disable_port(struct Scsi_Host *host)
  
         vha->hw->flags.port_isolated = 1;
  
+       if (qla2x00_isp_reg_stat(vha->hw)) {
+               ql_log(ql_log_info, vha, 0x9006,
+                   "PCI/Register disconnect, exiting.\n");
+               qla_pci_set_eeh_busy(vha);
+               return FAILED;
+       }
         if (qla2x00_chip_is_down(vha))
                 return 0;
  
@@ -9746,6 +9759,13 @@ int qla2xxx_enable_port(struct Scsi_Host *host)
  {
         scsi_qla_host_t *vha = shost_priv(host);
  
+       if (qla2x00_isp_reg_stat(vha->hw)) {
+               ql_log(ql_log_info, vha, 0x9001,
+                   "PCI/Register disconnect, exiting.\n");
+               qla_pci_set_eeh_busy(vha);
+               return FAILED;
+       }
+
         vha->hw->flags.port_isolated = 0;
         /* Set the flag to 1, so that isp_abort can proceed */
         vha->flags.online = 1;
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c

index 7e5d8702c9f793a92a9dc82a357d3a1a84ff1f07..6542a258cb75135a9732ddf278f0e1f01e5c1bb9 100644 (file)
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -333,6 +333,11 @@ MODULE_PARM_DESC(ql2xabts_wait_nvme,
                  "To wait for ABTS response on I/O timeouts for NVMe. (default: 1)");
  
  
+u32 ql2xdelay_before_pci_error_handling = 5;
+module_param(ql2xdelay_before_pci_error_handling, uint, 0644);
+MODULE_PARM_DESC(ql2xdelay_before_pci_error_handling,
+       "Number of seconds delayed before qla begin PCI error self-handling (default: 5).\n");
+
  static void qla2x00_clear_drv_active(struct qla_hw_data *);
  static void qla2x00_free_device(scsi_qla_host_t *);
  static int qla2xxx_map_queues(struct Scsi_Host *shost);
@@ -7251,6 +7256,44 @@ static void qla_heart_beat(struct scsi_qla_host *vha, u16 dpc_started)
         }
  }
  
+static void qla_wind_down_chip(scsi_qla_host_t *vha)
+{
+       struct qla_hw_data *ha = vha->hw;
+
+       if (!ha->flags.eeh_busy)
+               return;
+       if (ha->pci_error_state)
+               /* the system is trying to recover; leave the adapter alone */
+               return;
+
+       /*
+        * The system is not handling the PCIe error, so the driver makes a
+        * best-effort, two-step wind down of the adapter: reset, then flush.
+        */
+       if (time_after_eq(jiffies, ha->eeh_jif + ql2xdelay_before_pci_error_handling * HZ) &&
+           !ha->flags.eeh_flush) {
+               ql_log(ql_log_info, vha, 0x9009,
+                   "PCI Error detected, attempting to reset hardware.\n");
+
+               ha->isp_ops->reset_chip(vha);
+               ha->isp_ops->disable_intrs(ha);
+
+               ha->flags.eeh_flush = EEH_FLUSH_RDY;
+               ha->eeh_jif = jiffies;
+
+       } else if (ha->flags.eeh_flush == EEH_FLUSH_RDY &&
+           time_after_eq(jiffies, ha->eeh_jif +  5 * HZ)) {
+               pci_clear_master(ha->pdev);
+
+               /* flush all outstanding commands */
+               qla2x00_abort_isp_cleanup(vha);
+               ha->flags.eeh_flush = EEH_FLUSH_DONE;
+
+               ql_log(ql_log_info, vha, 0x900a,
+                   "PCI Error handling complete, all IOs aborted.\n");
+       }
+}
+
  /**************************************************************************
  *   qla2x00_timer
  *
@@ -7274,6 +7317,8 @@ qla2x00_timer(struct timer_list *t)
         fc_port_t *fcport = NULL;
  
         if (ha->flags.eeh_busy) {
+               qla_wind_down_chip(vha);
+
                 ql_dbg(ql_dbg_timer, vha, 0x6000,
                     "EEH = %d, restarting timer.\n",
                     ha->flags.eeh_busy);
@@ -7854,6 +7899,9 @@ void qla_pci_set_eeh_busy(struct scsi_qla_host *vha)
  
         spin_lock_irqsave(&base_vha->work_lock, flags);
         if (!ha->flags.eeh_busy) {
+               ha->eeh_jif = jiffies;
+               ha->flags.eeh_flush = 0;
+
                 ha->flags.eeh_busy = 1;
                 do_cleanup = true;
         }
author	Quinn Tran <qutran@marvell.com>
	Thu, 16 Jun 2022 05:35:00 +0000 (22:35 -0700)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Wed, 17 Aug 2022 12:24:17 +0000 (14:24 +0200)
drivers/scsi/qla2xxx/qla_bsg.c		patch \| blob \| history
drivers/scsi/qla2xxx/qla_def.h		patch \| blob \| history
drivers/scsi/qla2xxx/qla_init.c		patch \| blob \| history
drivers/scsi/qla2xxx/qla_os.c		patch \| blob \| history