Skip to content

Commit 39b169e

Browse files
SergeyGorenkojgunthorpe
authored andcommitted
IB/iser: Fix RNR errors
Some users complain about RNR errors on the target, when heavy high-priority tasks run on the initiator. After the investigation, we found out that the receive WRs were exhausted, because the initiator could not post them on time. Receive work reqeusts are posted in chunks to reduce the number of hits to the HCA. The WRs are posted in the receive completion handler when the number of free receive buffers reaches the threshold. But on a high-loaded host, receive CQEs processing can be delayed and all receive WRs will be exhausted. In this case, the target will get an RNR error. To avoid this, we post receive WR, as soon as possible and not in a batch. This increases the number of hits to the HCA, but also the common implementation in most of Linux ULPs (e.g. NVMe-oF/RDMA). As a rule of thumb, performance improvements and heuristics are being added to the RDMA core layer or vendors low level drivers and it's about time to align iSER as well. Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Sergey Gorenko <[email protected]> Signed-off-by: Max Gurtovoy <[email protected]> Reviewed-by: Israel Rukshin <[email protected]> Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent b28801a commit 39b169e

File tree

3 files changed

+42
-78
lines changed

3 files changed

+42
-78
lines changed

drivers/infiniband/ulp/iser/iscsi_iser.h

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,6 @@
119119

120120
#define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX)
121121

122-
#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2)
123-
124122
/* the max TX (send) WR supported by the iSER QP is defined by *
125123
* max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect *
126124
* to have at max for SCSI command. The tx posting & completion handling code *
@@ -366,9 +364,7 @@ struct iser_fr_pool {
366364
* @qp: Connection Queue-pair
367365
* @cq: Connection completion queue
368366
* @cq_size: The number of max outstanding completions
369-
* @post_recv_buf_count: post receive counter
370367
* @sig_count: send work request signal count
371-
* @rx_wr: receive work request for batch posts
372368
* @device: reference to iser device
373369
* @fr_pool: connection fast registration poool
374370
* @pi_support: Indicate device T10-PI support
@@ -379,9 +375,7 @@ struct ib_conn {
379375
struct ib_qp *qp;
380376
struct ib_cq *cq;
381377
u32 cq_size;
382-
int post_recv_buf_count;
383378
u8 sig_count;
384-
struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX];
385379
struct iser_device *device;
386380
struct iser_fr_pool fr_pool;
387381
bool pi_support;
@@ -397,8 +391,6 @@ struct ib_conn {
397391
* @state: connection logical state
398392
* @qp_max_recv_dtos: maximum number of data outs, corresponds
399393
* to max number of post recvs
400-
* @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1)
401-
* @min_posted_rx: (qp_max_recv_dtos >> 2)
402394
* @max_cmds: maximum cmds allowed for this connection
403395
* @name: connection peer portal
404396
* @release_work: deffered work for release job
@@ -409,7 +401,6 @@ struct ib_conn {
409401
* (state is ISER_CONN_UP)
410402
* @conn_list: entry in ig conn list
411403
* @login_desc: login descriptor
412-
* @rx_desc_head: head of rx_descs cyclic buffer
413404
* @rx_descs: rx buffers array (cyclic buffer)
414405
* @num_rx_descs: number of rx descriptors
415406
* @scsi_sg_tablesize: scsi host sg_tablesize
@@ -422,8 +413,6 @@ struct iser_conn {
422413
struct iscsi_endpoint *ep;
423414
enum iser_conn_state state;
424415
unsigned qp_max_recv_dtos;
425-
unsigned qp_max_recv_dtos_mask;
426-
unsigned min_posted_rx;
427416
u16 max_cmds;
428417
char name[ISER_OBJECT_NAME_SIZE];
429418
struct work_struct release_work;
@@ -433,7 +422,6 @@ struct iser_conn {
433422
struct completion up_completion;
434423
struct list_head conn_list;
435424
struct iser_login_desc login_desc;
436-
unsigned int rx_desc_head;
437425
struct iser_rx_desc *rx_descs;
438426
u32 num_rx_descs;
439427
unsigned short scsi_sg_tablesize;
@@ -542,7 +530,8 @@ int iser_connect(struct iser_conn *iser_conn,
542530
int non_blocking);
543531

544532
int iser_post_recvl(struct iser_conn *iser_conn);
545-
int iser_post_recvm(struct iser_conn *iser_conn, int count);
533+
int iser_post_recvm(struct iser_conn *iser_conn,
534+
struct iser_rx_desc *rx_desc);
546535
int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
547536
bool signal);
548537

drivers/infiniband/ulp/iser/iser_initiator.c

Lines changed: 28 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -247,8 +247,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
247247
struct iser_device *device = ib_conn->device;
248248

249249
iser_conn->qp_max_recv_dtos = session->cmds_max;
250-
iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */
251-
iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2;
252250

253251
if (iser_alloc_fastreg_pool(ib_conn, session->scsi_cmds_max,
254252
iser_conn->pages_per_mr))
@@ -280,7 +278,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
280278
rx_sg->lkey = device->pd->local_dma_lkey;
281279
}
282280

283-
iser_conn->rx_desc_head = 0;
284281
return 0;
285282

286283
rx_desc_dma_map_failed:
@@ -322,32 +319,35 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn)
322319
static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
323320
{
324321
struct iser_conn *iser_conn = conn->dd_data;
325-
struct ib_conn *ib_conn = &iser_conn->ib_conn;
326322
struct iscsi_session *session = conn->session;
323+
int err = 0;
324+
int i;
327325

328326
iser_dbg("req op %x flags %x\n", req->opcode, req->flags);
329327
/* check if this is the last login - going to full feature phase */
330328
if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE)
331-
return 0;
332-
333-
/*
334-
* Check that there is one posted recv buffer
335-
* (for the last login response).
336-
*/
337-
WARN_ON(ib_conn->post_recv_buf_count != 1);
329+
goto out;
338330

339331
if (session->discovery_sess) {
340332
iser_info("Discovery session, re-using login RX buffer\n");
341-
return 0;
342-
} else
343-
iser_info("Normal session, posting batch of RX %d buffers\n",
344-
iser_conn->min_posted_rx);
333+
goto out;
334+
}
345335

346-
/* Initial post receive buffers */
347-
if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx))
348-
return -ENOMEM;
336+
iser_info("Normal session, posting batch of RX %d buffers\n",
337+
iser_conn->qp_max_recv_dtos - 1);
349338

350-
return 0;
339+
/*
340+
* Initial post receive buffers.
341+
* There is one already posted recv buffer (for the last login
342+
* response). Therefore, the first recv buffer is skipped here.
343+
*/
344+
for (i = 1; i < iser_conn->qp_max_recv_dtos; i++) {
345+
err = iser_post_recvm(iser_conn, &iser_conn->rx_descs[i]);
346+
if (err)
347+
goto out;
348+
}
349+
out:
350+
return err;
351351
}
352352

353353
static inline bool iser_signal_comp(u8 sig_count)
@@ -590,7 +590,11 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc)
590590
desc->rsp_dma, ISER_RX_LOGIN_SIZE,
591591
DMA_FROM_DEVICE);
592592

593-
ib_conn->post_recv_buf_count--;
593+
if (iser_conn->iscsi_conn->session->discovery_sess)
594+
return;
595+
596+
/* Post the first RX buffer that is skipped in iser_post_rx_bufs() */
597+
iser_post_recvm(iser_conn, iser_conn->rx_descs);
594598
}
595599

596600
static inline int
@@ -657,8 +661,7 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
657661
struct iser_conn *iser_conn = to_iser_conn(ib_conn);
658662
struct iser_rx_desc *desc = iser_rx(wc->wr_cqe);
659663
struct iscsi_hdr *hdr;
660-
int length;
661-
int outstanding, count, err;
664+
int length, err;
662665

663666
if (unlikely(wc->status != IB_WC_SUCCESS)) {
664667
iser_err_comp(wc, "task_rsp");
@@ -687,20 +690,9 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
687690
desc->dma_addr, ISER_RX_PAYLOAD_SIZE,
688691
DMA_FROM_DEVICE);
689692

690-
/* decrementing conn->post_recv_buf_count only --after-- freeing the *
691-
* task eliminates the need to worry on tasks which are completed in *
692-
* parallel to the execution of iser_conn_term. So the code that waits *
693-
* for the posted rx bufs refcount to become zero handles everything */
694-
ib_conn->post_recv_buf_count--;
695-
696-
outstanding = ib_conn->post_recv_buf_count;
697-
if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) {
698-
count = min(iser_conn->qp_max_recv_dtos - outstanding,
699-
iser_conn->min_posted_rx);
700-
err = iser_post_recvm(iser_conn, count);
701-
if (err)
702-
iser_err("posting %d rx bufs err %d\n", count, err);
703-
}
693+
err = iser_post_recvm(iser_conn, desc);
694+
if (err)
695+
iser_err("posting rx buffer err %d\n", err);
704696
}
705697

706698
void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc)

drivers/infiniband/ulp/iser/iser_verbs.c

Lines changed: 12 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -757,7 +757,6 @@ void iser_conn_init(struct iser_conn *iser_conn)
757757
INIT_LIST_HEAD(&iser_conn->conn_list);
758758
mutex_init(&iser_conn->state_mutex);
759759

760-
ib_conn->post_recv_buf_count = 0;
761760
ib_conn->reg_cqe.done = iser_reg_comp;
762761
}
763762

@@ -841,44 +840,28 @@ int iser_post_recvl(struct iser_conn *iser_conn)
841840
wr.num_sge = 1;
842841
wr.next = NULL;
843842

844-
ib_conn->post_recv_buf_count++;
845843
ib_ret = ib_post_recv(ib_conn->qp, &wr, NULL);
846-
if (ib_ret) {
847-
iser_err("ib_post_recv failed ret=%d\n", ib_ret);
848-
ib_conn->post_recv_buf_count--;
849-
}
844+
if (unlikely(ib_ret))
845+
iser_err("ib_post_recv login failed ret=%d\n", ib_ret);
850846

851847
return ib_ret;
852848
}
853849

854-
int iser_post_recvm(struct iser_conn *iser_conn, int count)
850+
int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc)
855851
{
856852
struct ib_conn *ib_conn = &iser_conn->ib_conn;
857-
unsigned int my_rx_head = iser_conn->rx_desc_head;
858-
struct iser_rx_desc *rx_desc;
859-
struct ib_recv_wr *wr;
860-
int i, ib_ret;
861-
862-
for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) {
863-
rx_desc = &iser_conn->rx_descs[my_rx_head];
864-
rx_desc->cqe.done = iser_task_rsp;
865-
wr->wr_cqe = &rx_desc->cqe;
866-
wr->sg_list = &rx_desc->rx_sg;
867-
wr->num_sge = 1;
868-
wr->next = wr + 1;
869-
my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask;
870-
}
853+
struct ib_recv_wr wr;
854+
int ib_ret;
871855

872-
wr--;
873-
wr->next = NULL; /* mark end of work requests list */
856+
rx_desc->cqe.done = iser_task_rsp;
857+
wr.wr_cqe = &rx_desc->cqe;
858+
wr.sg_list = &rx_desc->rx_sg;
859+
wr.num_sge = 1;
860+
wr.next = NULL;
874861

875-
ib_conn->post_recv_buf_count += count;
876-
ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, NULL);
877-
if (unlikely(ib_ret)) {
862+
ib_ret = ib_post_recv(ib_conn->qp, &wr, NULL);
863+
if (unlikely(ib_ret))
878864
iser_err("ib_post_recv failed ret=%d\n", ib_ret);
879-
ib_conn->post_recv_buf_count -= count;
880-
} else
881-
iser_conn->rx_desc_head = my_rx_head;
882865

883866
return ib_ret;
884867
}

0 commit comments

Comments
 (0)