]> git.baikalelectronics.ru Git - kernel.git/commitdiff
RDMA/mlx5: Fix access to wrong pointer while performing flush due to error
authorLeon Romanovsky <leonro@mellanox.com>
Wed, 18 Mar 2020 09:16:40 +0000 (11:16 +0200)
committerJason Gunthorpe <jgg@mellanox.com>
Tue, 24 Mar 2020 22:54:57 +0000 (19:54 -0300)
The main difference between send and receive SW completions is related to
separate treatment of WQ queue. For receive completions, the initial index
to be flushed is stored in "tail", while for send completions, it is in
deleted "last_poll".

  CPU: 54 PID: 53405 Comm: kworker/u161:0 Kdump: loaded Tainted: G           OE    --------- -t - 4.18.0-147.el8.ppc64le #1
  Workqueue: ib-comp-unb-wq ib_cq_poll_work [ib_core]
  NIP:  c000003c7c00a000 LR: c00800000e586af4 CTR: c000003c7c00a000
  REGS: c0000036cc9db940 TRAP: 0400   Tainted: G           OE    --------- -t -  (4.18.0-147.el8.ppc64le)
  MSR:  9000000010009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 24004488  XER: 20040000
  CFAR: c00800000e586af0 IRQMASK: 0
  GPR00: c00800000e586ab4 c0000036cc9dbbc0 c00800000e5f1a00 c0000037d8433800
  GPR04: c000003895a26800 c0000037293f2000 0000000000000201 0000000000000011
  GPR08: c000003895a26c80 c000003c7c00a000 0000000000000000 c00800000ed30438
  GPR12: c000003c7c00a000 c000003fff684b80 c00000000017c388 c00000396ec4be40
  GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
  GPR20: c00000000151e498 0000000000000010 c000003895a26848 0000000000000010
  GPR24: 0000000000000010 0000000000010000 c000003895a26800 0000000000000000
  GPR28: 0000000000000010 c0000037d8433800 c000003895a26c80 c000003895a26800
  NIP [c000003c7c00a000] 0xc000003c7c00a000
  LR [c00800000e586af4] __ib_process_cq+0xec/0x1b0 [ib_core]
  Call Trace:
  [c0000036cc9dbbc0] [c00800000e586ab4] __ib_process_cq+0xac/0x1b0 [ib_core] (unreliable)
  [c0000036cc9dbc40] [c00800000e586c88] ib_cq_poll_work+0x40/0xb0 [ib_core]
  [c0000036cc9dbc70] [c000000000171f44] process_one_work+0x2f4/0x5c0
  [c0000036cc9dbd10] [c000000000172a0c] worker_thread+0xcc/0x760
  [c0000036cc9dbdc0] [c00000000017c52c] kthread+0x1ac/0x1c0
  [c0000036cc9dbe30] [c00000000000b75c] ret_from_kernel_thread+0x5c/0x80

Fixes: 183c6ce37a2e ("RDMA/mlx5: Delete unreachable handle_atomic code by simplifying SW completion")
Link: https://lore.kernel.org/r/20200318091640.44069-1-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
drivers/infiniband/hw/mlx5/cq.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/qp.c

index 367a71bc5f4b628b97f5bc70e0e1efd910b100ba..3dec3de903b751054c68bd2f407d3c44bd1c4ef6 100644 (file)
@@ -330,6 +330,22 @@ static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev,
                dump_cqe(dev, cqe);
 }
 
+static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
+                          u16 tail, u16 head)
+{
+       u16 idx;
+
+       do {
+               idx = tail & (qp->sq.wqe_cnt - 1);
+               if (idx == head)
+                       break;
+
+               tail = qp->sq.w_list[idx].next;
+       } while (1);
+       tail = qp->sq.w_list[idx].next;
+       qp->sq.last_poll = tail;
+}
+
 static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf)
 {
        mlx5_frag_buf_free(dev->mdev, &buf->frag_buf);
@@ -368,7 +384,7 @@ static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
 }
 
 static void sw_comp(struct mlx5_ib_qp *qp, int num_entries, struct ib_wc *wc,
-                   int *npolled, int is_send)
+                   int *npolled, bool is_send)
 {
        struct mlx5_ib_wq *wq;
        unsigned int cur;
@@ -383,10 +399,16 @@ static void sw_comp(struct mlx5_ib_qp *qp, int num_entries, struct ib_wc *wc,
                return;
 
        for (i = 0;  i < cur && np < num_entries; i++) {
-               wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+               unsigned int idx;
+
+               idx = (is_send) ? wq->last_poll : wq->tail;
+               idx &= (wq->wqe_cnt - 1);
+               wc->wr_id = wq->wrid[idx];
                wc->status = IB_WC_WR_FLUSH_ERR;
                wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
                wq->tail++;
+               if (is_send)
+                       wq->last_poll = wq->w_list[idx].next;
                np++;
                wc->qp = &qp->ibqp;
                wc++;
@@ -473,6 +495,7 @@ repoll:
                wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
                idx = wqe_ctr & (wq->wqe_cnt - 1);
                handle_good_req(wc, cqe64, wq, idx);
+               handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
                wc->wr_id = wq->wrid[idx];
                wq->tail = wq->wqe_head[idx] + 1;
                wc->status = IB_WC_SUCCESS;
index bb78142bca5eaf8a137785d144520ed391a3bbf4..f3bdbd5e50964efda778bbd69e0663ac55389de2 100644 (file)
@@ -288,6 +288,7 @@ struct mlx5_ib_wq {
        unsigned                head;
        unsigned                tail;
        u16                     cur_post;
+       u16                     last_poll;
        void                    *cur_edge;
 };
 
index 957f3a52589bef9026fadcee8c5601fc6aa0c2d8..c2967982f02c6863cddf4364a743932421230056 100644 (file)
@@ -3775,6 +3775,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                qp->sq.cur_post = 0;
                if (qp->sq.wqe_cnt)
                        qp->sq.cur_edge = get_sq_edge(&qp->sq, 0);
+               qp->sq.last_poll = 0;
                qp->db.db[MLX5_RCV_DBR] = 0;
                qp->db.db[MLX5_SND_DBR] = 0;
        }