git.baikalelectronics.ru Git - kernel.git/commitdiff
scsi: qla2xxx: Fix crash in NVMe abort path
author Arun Easi <aeasi@marvell.com>
Wed, 8 Sep 2021 16:46:16 +0000 (09:46 -0700)
committer Martin K. Petersen <martin.petersen@oracle.com>
Wed, 15 Sep 2021 03:33:19 +0000 (23:33 -0400)
A system crash was seen when I/O was run against an NVMe target while aborts
were occurring.

Crash stack is:

    -- relevant crash stack --
    BUG: kernel NULL pointer dereference, address: 0000000000000010
    :
    #6 [ffffae1f8666bdd0] page_fault at ffffffffa740122e
       [exception RIP: qla_nvme_abort_work+339]
       RIP: ffffffffc0f592e3  RSP: ffffae1f8666be80  RFLAGS: 00010297
       RAX: 0000000000000000  RBX: ffff9b581fc8af80  RCX: ffffffffc0f83bd0
       RDX: 0000000000000001  RSI: ffff9b5839c6c7c8  RDI: 0000000008000000
       RBP: ffff9b6832f85000   R8: ffffffffc0f68160   R9: ffffffffc0f70652
       R10: ffffae1f862ffdc8  R11: 0000000000000300  R12: 000000000000010d
       R13: 0000000000000000  R14: ffff9b5839cea000  R15: 0ffff9b583fab170
       ORIG_RAX: ffffffffffffffff   CS: 0010  SS: 0018
    #7 [ffffae1f8666be98] process_one_work at ffffffffa6aba184
    #8 [ffffae1f8666bed8] worker_thread at ffffffffa6aba39d
    #9 [ffffae1f8666bf10] kthread at ffffffffa6ac06ed

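The crash was caused by accessing a stale SRB structure after the command had
been aborted. Fix the issue by caching the needed values (the handle and the
abort-wait condition) before calling abort_command() and removing the stale
accesses.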

Link: https://lore.kernel.org/r/20210908164622.19240-5-njavali@marvell.com
Fixes: bb7549dd11a5 ("scsi: qla2xxx: Fix hang on NVMe command timeouts")
Cc: stable@vger.kernel.org
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Arun Easi <aeasi@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/scsi/qla2xxx/qla_nvme.c

index 1c5da2dbd6f97897cceea5f72754307ac608a79e..877b2b625020471abfbd8e846bbd4caa61ed7f2b 100644 (file)
@@ -228,6 +228,8 @@ static void qla_nvme_abort_work(struct work_struct *work)
        fc_port_t *fcport = sp->fcport;
        struct qla_hw_data *ha = fcport->vha->hw;
        int rval, abts_done_called = 1;
+       bool io_wait_for_abort_done;
+       uint32_t handle;
 
        ql_dbg(ql_dbg_io, fcport->vha, 0xffff,
               "%s called for sp=%p, hndl=%x on fcport=%p desc=%p deleted=%d\n",
@@ -244,12 +246,20 @@ static void qla_nvme_abort_work(struct work_struct *work)
                goto out;
        }
 
+       /*
+        * sp may not be valid after abort_command if return code is either
+        * SUCCESS or ERR_FROM_FW codes, so cache the value here.
+        */
+       io_wait_for_abort_done = ql2xabts_wait_nvme &&
+                                       QLA_ABTS_WAIT_ENABLED(sp);
+       handle = sp->handle;
+
        rval = ha->isp_ops->abort_command(sp);
 
        ql_dbg(ql_dbg_io, fcport->vha, 0x212b,
            "%s: %s command for sp=%p, handle=%x on fcport=%p rval=%x\n",
            __func__, (rval != QLA_SUCCESS) ? "Failed to abort" : "Aborted",
-           sp, sp->handle, fcport, rval);
+           sp, handle, fcport, rval);
 
        /*
         * If async tmf is enabled, the abort callback is called only on
@@ -264,7 +274,7 @@ static void qla_nvme_abort_work(struct work_struct *work)
         * are waited until ABTS complete. This kref is decreased
         * at qla24xx_abort_sp_done function.
         */
-       if (abts_done_called && ql2xabts_wait_nvme && QLA_ABTS_WAIT_ENABLED(sp))
+       if (abts_done_called && io_wait_for_abort_done)
                return;
 out:
        /* kref_get was done before work was schedule. */