When injecting EEH errors the port is getting hung up waiting on the node
list to empty, message number 0233. The driver is stuck at this point and
also can't unload. The driver makes transport remoteport delete calls which
try to abort I/O's, but the EEH daemon has already called the driver to
detach and the detachment has set the global FC_UNLOADING flag. There are
several code paths that will avoid I/O cleanup if the FC_UNLOADING flag is
set, resulting in transports waiting for I/O while the driver is waiting on
transports to clean up.
Additionally, during study of the list, a locking issue was found in
lpfc_sli_abort_iocb_ring that could corrupt the list.
A special case was added to the lpfc_cleanup() routine to call
lpfc_sli_flush_rings() if the driver is FC_UNLOADING and if the pci-slot
is offline (e.g. EEH).
The SLI4 part of lpfc_sli_abort_iocb_ring() is changed to use the
ring_lock. Also added code to cancel the I/Os if the pci-slot is offline
and added checks and returns for the FC_UNLOADING and HBA_IOQ_FLUSH flags
to prevent trying to send an I/O that we cannot handle.
Link: https://lore.kernel.org/r/20220317032737.45308-3-jsmart2021@gmail.com
Co-developed-by: Justin Tee <justin.tee@broadcom.com>
Signed-off-by: Justin Tee <justin.tee@broadcom.com>
Signed-off-by: James Smart <jsmart2021@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
ndlp->nlp_flag &= ~NLP_UNREG_INP;
mempool_free(mbox, phba->mbox_mem_pool);
acc_plogi = 1;
+ lpfc_nlp_put(ndlp);
}
} else {
lpfc_printf_vlog(vport, KERN_INFO,
static uint16_t lpfc_find_cpu_handle(struct lpfc_hba *, uint16_t, int);
static void lpfc_setup_bg(struct lpfc_hba *, struct Scsi_Host *);
static int lpfc_sli4_cgn_parm_chg_evt(struct lpfc_hba *);
+static void lpfc_sli4_prep_dev_for_reset(struct lpfc_hba *phba);
static struct scsi_transport_template *lpfc_transport_template = NULL;
static struct scsi_transport_template *lpfc_vport_transport_template = NULL;
if (pci_channel_offline(phba->pcidev)) {
lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT,
"3166 pci channel is offline\n");
+ lpfc_sli_flush_io_rings(phba);
return;
}
NLP_EVT_DEVICE_RM);
}
+ /* This is a special case flush to return all
+ * IOs before entering this loop. There are
+ * two points in the code where a flush is
+ * avoided if the FC_UNLOADING flag is set.
+ * one is in the multipool destroy,
+ * (this prevents a crash) and the other is
+ * in the nvme abort handler, ( also prevents
+ * a crash). Both of these exceptions are
+ * cases where the slot is still accessible.
+ * The flush here is only when the pci slot
+ * is offline.
+ */
+ if (vport->load_flag & FC_UNLOADING &&
+ pci_channel_offline(phba->pcidev))
+ lpfc_sli_flush_io_rings(vport->phba);
+
/* At this point, ALL ndlp's should be gone
* because of the previous NLP_EVT_DEVICE_RM.
* Lets wait for this to happen, if needed.
list_for_each_entry_safe(ndlp, next_ndlp,
&vport->fc_nodes, nlp_listp) {
lpfc_printf_vlog(ndlp->vport, KERN_ERR,
- LOG_TRACE_EVENT,
+ LOG_DISCOVERY,
"0282 did:x%x ndlp:x%px "
"refcnt:%d xflags x%x nflag x%x\n",
ndlp->nlp_DID, (void *)ndlp,
/* Abort all iocbs associated with the hba */
lpfc_sli_hba_iocb_abort(phba);
- /* Wait for completion of device XRI exchange busy */
- lpfc_sli4_xri_exchange_busy_wait(phba);
+ if (!pci_channel_offline(phba->pcidev))
+ /* Wait for completion of device XRI exchange busy */
+ lpfc_sli4_xri_exchange_busy_wait(phba);
/* per-phba callback de-registration for hotplug event */
if (phba->pport)
"2711 PCI channel permanent disable for failure\n");
/* Block all SCSI devices' I/Os on the host */
lpfc_scsi_dev_block(phba);
+ lpfc_sli4_prep_dev_for_reset(phba);
/* stop all timers */
lpfc_stop_hba_timers(phba);
lport = (struct lpfc_nvme_lport *)pnvme_lport->private;
vport = lport->vport;
+
+ if (!vport || vport->load_flag & FC_UNLOADING ||
+ vport->phba->hba_flag & HBA_IOQ_FLUSH)
+ return -ENODEV;
+
qhandle = kzalloc(sizeof(struct lpfc_nvme_qhandle), GFP_KERNEL);
if (qhandle == NULL)
return -ENOMEM;
return -EINVAL;
remoteport = lpfc_rport->remoteport;
- if (!vport->localport)
+ if (!vport->localport ||
+ vport->phba->hba_flag & HBA_IOQ_FLUSH)
return -EINVAL;
lport = vport->localport->private;
ndlp->nlp_DID, ntype, nstate);
return -ENODEV;
}
+ if (vport->phba->hba_flag & HBA_IOQ_FLUSH)
+ return -ENODEV;
if (!vport->phba->sli4_hba.nvmels_wq)
return -ENOMEM;
return -EINVAL;
vport = lport->vport;
- if (vport->load_flag & FC_UNLOADING)
+ if (vport->load_flag & FC_UNLOADING ||
+ vport->phba->hba_flag & HBA_IOQ_FLUSH)
return -ENODEV;
atomic_inc(&lport->fc4NvmeLsRequests);
phba = vport->phba;
- if (unlikely(vport->load_flag & FC_UNLOADING)) {
+ if ((unlikely(vport->load_flag & FC_UNLOADING)) ||
+ phba->hba_flag & HBA_IOQ_FLUSH) {
lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_IOERR,
"6124 Fail IO, Driver unload\n");
atomic_inc(&lport->xmt_fcp_err);
void
lpfc_sli_abort_iocb_ring(struct lpfc_hba *phba, struct lpfc_sli_ring *pring)
{
- LIST_HEAD(completions);
+ LIST_HEAD(tx_completions);
+ LIST_HEAD(txcmplq_completions);
struct lpfc_iocbq *iocb, *next_iocb;
+ int offline;
if (pring->ringno == LPFC_ELS_RING) {
lpfc_fabric_abort_hba(phba);
}
+ offline = pci_channel_offline(phba->pcidev);
/* Error everything on txq and txcmplq
* First do the txq.
*/
if (phba->sli_rev >= LPFC_SLI_REV4) {
spin_lock_irq(&pring->ring_lock);
- list_splice_init(&pring->txq, &completions);
+ list_splice_init(&pring->txq, &tx_completions);
pring->txq_cnt = 0;
- spin_unlock_irq(&pring->ring_lock);
- spin_lock_irq(&phba->hbalock);
- /* Next issue ABTS for everything on the txcmplq */
- list_for_each_entry_safe(iocb, next_iocb, &pring->txcmplq, list)
- lpfc_sli_issue_abort_iotag(phba, pring, iocb, NULL);
- spin_unlock_irq(&phba->hbalock);
+ if (offline) {
+ list_splice_init(&pring->txcmplq,
+ &txcmplq_completions);
+ } else {
+ /* Next issue ABTS for everything on the txcmplq */
+ list_for_each_entry_safe(iocb, next_iocb,
+ &pring->txcmplq, list)
+ lpfc_sli_issue_abort_iotag(phba, pring,
+ iocb, NULL);
+ }
+ spin_unlock_irq(&pring->ring_lock);
} else {
spin_lock_irq(&phba->hbalock);
- list_splice_init(&pring->txq, &completions);
+ list_splice_init(&pring->txq, &tx_completions);
pring->txq_cnt = 0;
- /* Next issue ABTS for everything on the txcmplq */
- list_for_each_entry_safe(iocb, next_iocb, &pring->txcmplq, list)
- lpfc_sli_issue_abort_iotag(phba, pring, iocb, NULL);
+ if (offline) {
+ list_splice_init(&pring->txcmplq, &txcmplq_completions);
+ } else {
+ /* Next issue ABTS for everything on the txcmplq */
+ list_for_each_entry_safe(iocb, next_iocb,
+ &pring->txcmplq, list)
+ lpfc_sli_issue_abort_iotag(phba, pring,
+ iocb, NULL);
+ }
spin_unlock_irq(&phba->hbalock);
}
- /* Make sure HBA is alive */
- lpfc_issue_hb_tmo(phba);
+ if (offline) {
+ /* Cancel all the IOCBs from the completions list */
+ lpfc_sli_cancel_iocbs(phba, &txcmplq_completions,
+ IOSTAT_LOCAL_REJECT, IOERR_SLI_ABORTED);
+ } else {
+ /* Make sure HBA is alive */
+ lpfc_issue_hb_tmo(phba);
+ }
/* Cancel all the IOCBs from the completions list */
- lpfc_sli_cancel_iocbs(phba, &completions, IOSTAT_LOCAL_REJECT,
+ lpfc_sli_cancel_iocbs(phba, &tx_completions, IOSTAT_LOCAL_REJECT,
IOERR_SLI_ABORTED);
}