Commit 10e6fc1

svcrdma: Post the Reply chunk and Send WR together
Reduce the doorbell and Send completion rates when sending RPC/RDMA replies that have Reply chunks. NFS READDIR procedures typically return their result in a Reply chunk, for example.

Instead of calling ib_post_send() to post the Write WRs for the Reply chunk, and then calling it again to post the Send WR that conveys the transport header, chain the Write WRs to the Send WR and call ib_post_send() only once.

Thanks to the Send Queue completion ordering rules, when the Send WR completes, that guarantees that Write WRs posted before it have also completed successfully. Thus all Write WRs for the Reply chunk can remain unsignaled.

Instead of handling a Write completion and then a Send completion, only the Send completion is seen, and it handles clean up for both the Writes and the Send.

Signed-off-by: Chuck Lever <[email protected]>
1 parent a1f5788 commit 10e6fc1
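
The pattern in miniature (a hedged sketch using the kernel verbs API, not the svcrdma code; all names below are illustrative): chain an unsignaled RDMA Write WR in front of a signaled Send WR, then post both with one ib_post_send() call.

    #include <rdma/ib_verbs.h>

    static int post_write_then_send(struct ib_qp *qp,
                                    struct ib_rdma_wr *write_wr,
                                    struct ib_send_wr *send_wr)
    {
            write_wr->wr.opcode = IB_WR_RDMA_WRITE;
            write_wr->wr.send_flags = 0;    /* unsignaled: no Write CQE */
            write_wr->wr.next = send_wr;    /* Send trails the Write */

            send_wr->opcode = IB_WR_SEND;
            send_wr->send_flags = IB_SEND_SIGNALED;
            send_wr->next = NULL;

            /* SQ completion ordering: a successful Send completion
             * guarantees the chained Write before it also completed.
             */
            return ib_post_send(qp, &write_wr->wr, NULL);
    }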

3 files changed: +66 -39 lines

include/linux/sunrpc/svc_rdma.h

Lines changed: 9 additions & 4 deletions
@@ -262,19 +262,24 @@ extern void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *ctxt);
 extern int svc_rdma_recvfrom(struct svc_rqst *);
 
 /* svc_rdma_rw.c */
+extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
+                             struct svc_rdma_chunk_ctxt *cc);
 extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma);
 extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
                              struct svc_rdma_chunk_ctxt *cc);
 extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
                                 struct svc_rdma_chunk_ctxt *cc,
                                 enum dma_data_direction dir);
+extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma,
+                                         struct svc_rdma_send_ctxt *ctxt);
 extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
                                      const struct svc_rdma_chunk *chunk,
                                      const struct xdr_buf *xdr);
-extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
-                                     const struct svc_rdma_recv_ctxt *rctxt,
-                                     struct svc_rdma_send_ctxt *sctxt,
-                                     const struct xdr_buf *xdr);
+extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
+                                        const struct svc_rdma_pcl *write_pcl,
+                                        const struct svc_rdma_pcl *reply_pcl,
+                                        struct svc_rdma_send_ctxt *sctxt,
+                                        const struct xdr_buf *xdr);
 extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
                                       struct svc_rqst *rqstp,
                                       struct svc_rdma_recv_ctxt *head);

net/sunrpc/xprtrdma/svc_rdma_rw.c

Lines changed: 37 additions & 21 deletions
@@ -230,10 +230,18 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
 	queue_work(svcrdma_wq, &info->wi_work);
 }
 
-static void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma,
-					 struct svc_rdma_chunk_ctxt *cc)
+/**
+ * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources
+ * @rdma: controlling transport
+ * @ctxt: Send context that is being released
+ */
+void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma,
+				  struct svc_rdma_send_ctxt *ctxt)
 {
-	svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount);
+	struct svc_rdma_chunk_ctxt *cc = &ctxt->sc_reply_info.wi_cc;
+
+	if (!cc->cc_sqecount)
+		return;
 	svc_rdma_cc_release(rdma, cc, DMA_TO_DEVICE);
 }
 
@@ -254,7 +262,6 @@ static void svc_rdma_reply_done(struct ib_cq *cq, struct ib_wc *wc)
 	switch (wc->status) {
 	case IB_WC_SUCCESS:
 		trace_svcrdma_wc_reply(&cc->cc_cid);
-		svc_rdma_reply_chunk_release(rdma, cc);
 		return;
 	case IB_WC_WR_FLUSH_ERR:
 		trace_svcrdma_wc_reply_flush(wc, &cc->cc_cid);
@@ -263,7 +270,6 @@ static void svc_rdma_reply_done(struct ib_cq *cq, struct ib_wc *wc)
 		trace_svcrdma_wc_reply_err(wc, &cc->cc_cid);
 	}
 
-	svc_rdma_reply_chunk_release(rdma, cc);
 	svc_xprt_deferred_close(&rdma->sc_xprt);
 }
 
@@ -637,9 +643,10 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
 }
 
 /**
- * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
+ * svc_rdma_prepare_reply_chunk - Construct WR chain for writing the Reply chunk
  * @rdma: controlling RDMA transport
- * @rctxt: Write and Reply chunks provisioned by the client
+ * @write_pcl: Write chunk list provided by client
+ * @reply_pcl: Reply chunk provided by client
  * @sctxt: Send WR resources
  * @xdr: xdr_buf containing an RPC Reply
  *
@@ -650,35 +657,44 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
  * %-ENOTCONN if posting failed (connection is lost),
  * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
  */
-int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
-			      const struct svc_rdma_recv_ctxt *rctxt,
-			      struct svc_rdma_send_ctxt *sctxt,
-			      const struct xdr_buf *xdr)
+int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
+				 const struct svc_rdma_pcl *write_pcl,
+				 const struct svc_rdma_pcl *reply_pcl,
+				 struct svc_rdma_send_ctxt *sctxt,
+				 const struct xdr_buf *xdr)
 {
 	struct svc_rdma_write_info *info = &sctxt->sc_reply_info;
 	struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
+	struct ib_send_wr *first_wr;
+	struct list_head *pos;
+	struct ib_cqe *cqe;
 	int ret;
 
-	if (likely(pcl_is_empty(&rctxt->rc_reply_pcl)))
-		return 0;	/* client provided no Reply chunk */
-
 	info->wi_rdma = rdma;
-	info->wi_chunk = pcl_first_chunk(&rctxt->rc_reply_pcl);
+	info->wi_chunk = pcl_first_chunk(reply_pcl);
 	info->wi_seg_off = 0;
 	info->wi_seg_no = 0;
-	svc_rdma_cc_init(rdma, &info->wi_cc);
 	info->wi_cc.cc_cqe.done = svc_rdma_reply_done;
 
-	ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+	ret = pcl_process_nonpayloads(write_pcl, xdr,
 				      svc_rdma_xb_write, info);
 	if (ret < 0)
 		return ret;
 
-	trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount);
-	ret = svc_rdma_post_chunk_ctxt(rdma, cc);
-	if (ret < 0)
-		return ret;
+	first_wr = sctxt->sc_wr_chain;
+	cqe = &cc->cc_cqe;
+	list_for_each(pos, &cc->cc_rwctxts) {
+		struct svc_rdma_rw_ctxt *rwc;
 
+		rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list);
+		first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
+					   rdma->sc_port_num, cqe, first_wr);
+		cqe = NULL;
+	}
+	sctxt->sc_wr_chain = first_wr;
+	sctxt->sc_sqecount += cc->cc_sqecount;
+
+	trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount);
 	return xdr->len;
 }
 
net/sunrpc/xprtrdma/svc_rdma_sendto.c

Lines changed: 20 additions & 14 deletions
@@ -205,6 +205,7 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
 	xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf,
 			ctxt->sc_xprt_buf, NULL);
 
+	svc_rdma_cc_init(rdma, &ctxt->sc_reply_info.wi_cc);
 	ctxt->sc_send_wr.num_sge = 0;
 	ctxt->sc_cur_sge_no = 0;
 	ctxt->sc_page_count = 0;
@@ -226,6 +227,8 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
 	struct ib_device *device = rdma->sc_cm_id->device;
 	unsigned int i;
 
+	svc_rdma_reply_chunk_release(rdma, ctxt);
+
 	if (ctxt->sc_page_count)
 		release_pages(ctxt->sc_pages, ctxt->sc_page_count);
 
@@ -867,16 +870,10 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
  * in sc_sges[0], and the RPC xdr_buf is prepared in following sges.
  *
  * Depending on whether a Write list or Reply chunk is present,
- * the server may send all, a portion of, or none of the xdr_buf.
+ * the server may Send all, a portion of, or none of the xdr_buf.
  * In the latter case, only the transport header (sc_sges[0]) is
 * transmitted.
 *
- * RDMA Send is the last step of transmitting an RPC reply. Pages
- * involved in the earlier RDMA Writes are here transferred out
- * of the rqstp and into the sctxt's page array. These pages are
- * DMA unmapped by each Write completion, but the subsequent Send
- * completion finally releases these pages.
- *
 * Assumptions:
 * - The Reply's transport header will never be larger than a page.
 */
@@ -885,20 +882,24 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
 				   const struct svc_rdma_recv_ctxt *rctxt,
 				   struct svc_rqst *rqstp)
 {
+	struct ib_send_wr *send_wr = &sctxt->sc_send_wr;
 	int ret;
 
 	ret = svc_rdma_map_reply_msg(rdma, sctxt, &rctxt->rc_write_pcl,
 				     &rctxt->rc_reply_pcl, &rqstp->rq_res);
 	if (ret < 0)
 		return ret;
 
+	/* Transfer pages involved in RDMA Writes to the sctxt's
+	 * page array. Completion handling releases these pages.
+	 */
 	svc_rdma_save_io_pages(rqstp, sctxt);
 
 	if (rctxt->rc_inv_rkey) {
-		sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
-		sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey;
+		send_wr->opcode = IB_WR_SEND_WITH_INV;
+		send_wr->ex.invalidate_rkey = rctxt->rc_inv_rkey;
 	} else {
-		sctxt->sc_send_wr.opcode = IB_WR_SEND;
+		send_wr->opcode = IB_WR_SEND;
 	}
 
 	return svc_rdma_post_send(rdma, sctxt);
@@ -1012,10 +1013,15 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	if (!p)
 		goto put_ctxt;
 
-	ret = svc_rdma_send_reply_chunk(rdma, rctxt, sctxt, &rqstp->rq_res);
-	if (ret < 0)
-		goto reply_chunk;
-	rc_size = ret;
+	rc_size = 0;
+	if (!pcl_is_empty(&rctxt->rc_reply_pcl)) {
+		ret = svc_rdma_prepare_reply_chunk(rdma, &rctxt->rc_write_pcl,
+						   &rctxt->rc_reply_pcl, sctxt,
+						   &rqstp->rq_res);
+		if (ret < 0)
+			goto reply_chunk;
+		rc_size = ret;
+	}
 
 	*p++ = *rdma_argp;
 	*p++ = *(rdma_argp + 1);
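
The net effect in svc_rdma_sendto(): the Reply chunk's Write WRs now ride in sctxt->sc_wr_chain ahead of the Send WR, so the eventual svc_rdma_post_send() rings the doorbell once for the entire reply. A hedged sketch of that final step, assuming (as this series suggests, but this commit does not show) that svc_rdma_post_send() passes sc_wr_chain to ib_post_send(); the function name below is illustrative:

    /* One doorbell per reply:
     *
     *	Write WR -> Write WR -> ... -> Send WR (signaled)
     *
     * Only the Send WR generates a completion; when it fires,
     * svc_rdma_send_ctxt_release() cleans up the Reply chunk's R/W
     * contexts via the new svc_rdma_reply_chunk_release().
     */
    static int example_post_reply(struct svcxprt_rdma *rdma,
                                  struct svc_rdma_send_ctxt *sctxt)
    {
            return ib_post_send(rdma->sc_qp, sctxt->sc_wr_chain, NULL);
    }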
