Skip to content

Commit 950bf4f

Browse files
Leon Romanovsky authored and jgunthorpe committed
RDMA/mlx5: Fix access to wrong pointer while performing flush due to error
The main difference between send and receive SW completions is the separate treatment of the WQ queue. For receive completions, the initial index to be flushed is stored in "tail", while for send completions it is stored in "last_poll", which had been deleted.

CPU: 54 PID: 53405 Comm: kworker/u161:0 Kdump: loaded Tainted: G OE --------- -t - 4.18.0-147.el8.ppc64le #1
Workqueue: ib-comp-unb-wq ib_cq_poll_work [ib_core]
NIP: c000003c7c00a000 LR: c00800000e586af4 CTR: c000003c7c00a000
REGS: c0000036cc9db940 TRAP: 0400 Tainted: G OE --------- -t - (4.18.0-147.el8.ppc64le)
MSR: 9000000010009033 <SF,HV,EE,ME,IR,DR,RI,LE> CR: 24004488 XER: 20040000
CFAR: c00800000e586af0 IRQMASK: 0
GPR00: c00800000e586ab4 c0000036cc9dbbc0 c00800000e5f1a00 c0000037d8433800
GPR04: c000003895a26800 c0000037293f2000 0000000000000201 0000000000000011
GPR08: c000003895a26c80 c000003c7c00a000 0000000000000000 c00800000ed30438
GPR12: c000003c7c00a000 c000003fff684b80 c00000000017c388 c00000396ec4be40
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: c00000000151e498 0000000000000010 c000003895a26848 0000000000000010
GPR24: 0000000000000010 0000000000010000 c000003895a26800 0000000000000000
GPR28: 0000000000000010 c0000037d8433800 c000003895a26c80 c000003895a26800
NIP [c000003c7c00a000] 0xc000003c7c00a000
LR [c00800000e586af4] __ib_process_cq+0xec/0x1b0 [ib_core]
Call Trace:
[c0000036cc9dbbc0] [c00800000e586ab4] __ib_process_cq+0xac/0x1b0 [ib_core] (unreliable)
[c0000036cc9dbc40] [c00800000e586c88] ib_cq_poll_work+0x40/0xb0 [ib_core]
[c0000036cc9dbc70] [c000000000171f44] process_one_work+0x2f4/0x5c0
[c0000036cc9dbd10] [c000000000172a0c] worker_thread+0xcc/0x760
[c0000036cc9dbdc0] [c00000000017c52c] kthread+0x1ac/0x1c0
[c0000036cc9dbe30] [c00000000000b75c] ret_from_kernel_thread+0x5c/0x80

Fixes: 8e3b688 ("RDMA/mlx5: Delete unreachable handle_atomic code by simplifying SW completion")
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Leon Romanovsky <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent 2d47fba commit 950bf4f

File tree

3 files changed

+27
-2
lines changed

3 files changed

+27
-2
lines changed

drivers/infiniband/hw/mlx5/cq.c

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,22 @@ static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev,
330330
dump_cqe(dev, cqe);
331331
}
332332

333+
/*
 * Walk the send queue's w_list chain starting at 'tail' until the entry
 * whose masked index equals 'head' is reached, then record the successor
 * of that entry in qp->sq.last_poll.
 *
 * qp:    queue pair whose send queue (qp->sq) is walked and updated.
 * cqe64: not referenced anywhere in this function body.
 * tail:  starting position of the walk; it is masked with (wqe_cnt - 1)
 *        before each use.
 * head:  index at which the walk stops; compared against the masked index.
 *
 * NOTE(review): the masking presumably relies on qp->sq.wqe_cnt being a
 * power of two - confirm against the queue setup code.
 * NOTE(review): the loop has no exit other than reaching 'head', so it
 * assumes 'head' is reachable from 'tail' via the .next links.
 */
static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
334+
u16 tail, u16 head)
335+
{
336+
u16 idx;
337+
338+
do {
339+
idx = tail & (qp->sq.wqe_cnt - 1);
340+
if (idx == head)
341+
break;
342+
343+
tail = qp->sq.w_list[idx].next;
344+
} while (1);
/* idx == head here: step one entry past it and save that position. */
345+
tail = qp->sq.w_list[idx].next;
346+
qp->sq.last_poll = tail;
347+
}
348+
333349
static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf)
334350
{
335351
mlx5_frag_buf_free(dev->mdev, &buf->frag_buf);
@@ -368,7 +384,7 @@ static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
368384
}
369385

370386
static void sw_comp(struct mlx5_ib_qp *qp, int num_entries, struct ib_wc *wc,
371-
int *npolled, int is_send)
387+
int *npolled, bool is_send)
372388
{
373389
struct mlx5_ib_wq *wq;
374390
unsigned int cur;
@@ -383,10 +399,16 @@ static void sw_comp(struct mlx5_ib_qp *qp, int num_entries, struct ib_wc *wc,
383399
return;
384400

385401
for (i = 0; i < cur && np < num_entries; i++) {
386-
wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
402+
unsigned int idx;
403+
404+
idx = (is_send) ? wq->last_poll : wq->tail;
405+
idx &= (wq->wqe_cnt - 1);
406+
wc->wr_id = wq->wrid[idx];
387407
wc->status = IB_WC_WR_FLUSH_ERR;
388408
wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
389409
wq->tail++;
410+
if (is_send)
411+
wq->last_poll = wq->w_list[idx].next;
390412
np++;
391413
wc->qp = &qp->ibqp;
392414
wc++;
@@ -473,6 +495,7 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,
473495
wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
474496
idx = wqe_ctr & (wq->wqe_cnt - 1);
475497
handle_good_req(wc, cqe64, wq, idx);
498+
handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
476499
wc->wr_id = wq->wrid[idx];
477500
wq->tail = wq->wqe_head[idx] + 1;
478501
wc->status = IB_WC_SUCCESS;

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ struct mlx5_ib_wq {
288288
unsigned head;
289289
unsigned tail;
290290
u16 cur_post;
291+
u16 last_poll;
291292
void *cur_edge;
292293
};
293294

drivers/infiniband/hw/mlx5/qp.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3775,6 +3775,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
37753775
qp->sq.cur_post = 0;
37763776
if (qp->sq.wqe_cnt)
37773777
qp->sq.cur_edge = get_sq_edge(&qp->sq, 0);
3778+
qp->sq.last_poll = 0;
37783779
qp->db.db[MLX5_RCV_DBR] = 0;
37793780
qp->db.db[MLX5_SND_DBR] = 0;
37803781
}

0 commit comments

Comments
 (0)