]> git.baikalelectronics.ru Git - kernel.git/commitdiff
nvme: fix regression when disconnect a recovering ctrl
authorRuozhu Li <liruozhu@huawei.com>
Thu, 23 Jun 2022 06:45:39 +0000 (14:45 +0800)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 21 Jul 2022 18:59:25 +0000 (20:59 +0200)
[ Upstream commit c28176d903104a5b1f28a70a53fb7eb8ad2e958a ]

We encountered a problem that the disconnect command hangs.
After analyzing the log and stack, we found that the triggering
process is as follows:
CPU0                          CPU1
                                nvme_rdma_error_recovery_work
                                  nvme_rdma_teardown_io_queues
nvme_do_delete_ctrl                 nvme_stop_queues
  nvme_remove_namespaces
  --clear ctrl->namespaces
                                    nvme_start_queues
                                    --no ns in ctrl->namespaces
    nvme_ns_remove                  return(because ctrl is deleting)
      blk_freeze_queue
        blk_mq_freeze_queue_wait
        --wait for ns to unquiesce to clean infligt IO, hang forever

This problem was not found in older kernels because we will flush
err work in nvme_stop_ctrl before nvme_remove_namespaces.It does not
seem to be modified for functional reasons, the patch can be revert
to solve the problem.

Revert commit ccc8e123ed2d ("nvme: remove the .stop_ctrl callout")

Signed-off-by: Ruozhu Li <liruozhu@huawei.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/nvme/host/core.c
drivers/nvme/host/nvme.h
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c

index 79e22618817de4c9e315233149ad84a6c7c786b0..d2ea6ca37c41f68a86f48699d65d708a8f989aac 100644 (file)
@@ -4034,6 +4034,8 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
        nvme_stop_keep_alive(ctrl);
        flush_work(&ctrl->async_event_work);
        cancel_work_sync(&ctrl->fw_act_work);
+       if (ctrl->ops->stop_ctrl)
+               ctrl->ops->stop_ctrl(ctrl);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
 
index 1d1431dd4f9e3b69b366fcf052d514e9592f3b55..81a5b968253ff2032374d3dde82bf7b8318d9f3a 100644 (file)
@@ -402,6 +402,7 @@ struct nvme_ctrl_ops {
        void (*free_ctrl)(struct nvme_ctrl *ctrl);
        void (*submit_async_event)(struct nvme_ctrl *ctrl);
        void (*delete_ctrl)(struct nvme_ctrl *ctrl);
+       void (*stop_ctrl)(struct nvme_ctrl *ctrl);
        int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
 };
 
index 4213c71b02a4bf322d44511d403837ad1665cb46..d5d7b2f98edc91c27a526027dec4c620dc092f7a 100644 (file)
@@ -973,6 +973,14 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
        }
 }
 
+static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
+{
+       struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+
+       cancel_work_sync(&ctrl->err_work);
+       cancel_delayed_work_sync(&ctrl->reconnect_work);
+}
+
 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
 {
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -1947,9 +1955,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
 
 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 {
-       cancel_work_sync(&ctrl->err_work);
-       cancel_delayed_work_sync(&ctrl->reconnect_work);
-
        nvme_rdma_teardown_io_queues(ctrl, shutdown);
        blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
        if (shutdown)
@@ -1999,6 +2004,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
        .submit_async_event     = nvme_rdma_submit_async_event,
        .delete_ctrl            = nvme_rdma_delete_ctrl,
        .get_address            = nvmf_get_address,
+       .stop_ctrl              = nvme_rdma_stop_ctrl,
 };
 
 /*
index 4378344f0e7abac8c1762e0731419b073f00a051..2a27ac9aedbaa4f0968ae20f217ae7fd52188187 100644 (file)
@@ -1973,9 +1973,6 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 
 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
 {
-       cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
-       cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
-
        nvme_tcp_teardown_io_queues(ctrl, shutdown);
        blk_mq_quiesce_queue(ctrl->admin_q);
        if (shutdown)
@@ -2014,6 +2011,12 @@ out_fail:
        nvme_tcp_reconnect_or_remove(ctrl);
 }
 
+static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
+{
+       cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
+       cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
+}
+
 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
 {
        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
@@ -2322,6 +2325,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
        .submit_async_event     = nvme_tcp_submit_async_event,
        .delete_ctrl            = nvme_tcp_delete_ctrl,
        .get_address            = nvmf_get_address,
+       .stop_ctrl              = nvme_tcp_stop_ctrl,
 };
 
 static bool