Skip to content

Commit 9c54bb0

Browse files
committed
OFI: move OFI_RETRY_UNTIL_DONE to common
In running some MPI stress tests on an HPE SS11 network we had to fall back to using the OFI BTL path. That in turn revealed some places in the BTL where we need to use a function similar to the MTL_OFI_RETRY_UNTIL_DONE macro. So as a first step, move this macro to the OFI common layer and invoke the more general opal_progress function. That's the content of this PR. Additional changes needed to the OFI BTL will be applied in subsequent PRs. The OFI MTL will require more work, as the situation hit with the HPE CXI provider indicates a need to implement some kind of send backlog queueing mechanism in the MTL rather than simply spinning on the OFI CQs hoping for progress at the OFI provider level. Signed-off-by: Howard Pritchard <[email protected]>
1 parent 424151d commit 9c54bb0

File tree

2 files changed

+89
-88
lines changed

2 files changed

+89
-88
lines changed

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 68 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
33
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2019-2023 Triad National Security, LLC. All rights
5+
* Copyright (c) 2019-2024 Triad National Security, LLC. All rights
66
* reserved.
77
* Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All Rights reserved.
88
* reserved.
@@ -266,25 +266,6 @@ ompi_mtl_ofi_progress(void)
266266
return count;
267267
}
268268

269-
/**
270-
* When attempting to execute an OFI operation we need to handle
271-
* resource overrun cases. When a call to an OFI OP fails with -FI_EAGAIN
272-
* the OFI mtl will attempt to progress any pending Completion Queue
273-
* events that may prevent additional operations to be enqueued.
274-
* If the call to ofi progress is successful, then the function call
275-
* will be retried.
276-
*/
277-
#define MTL_OFI_RETRY_UNTIL_DONE(FUNC, RETURN) \
278-
do { \
279-
do { \
280-
RETURN = FUNC; \
281-
if (OPAL_LIKELY(0 == RETURN)) {break;} \
282-
if (OPAL_LIKELY(RETURN == -FI_EAGAIN)) { \
283-
ompi_mtl_ofi_progress(); \
284-
} \
285-
} while (OPAL_LIKELY(-FI_EAGAIN == RETURN)); \
286-
} while (0);
287-
288269
#define MTL_OFI_LOG_FI_ERR(err, string) \
289270
do { \
290271
opal_output_verbose(1, opal_common_ofi.output, \
@@ -636,12 +617,12 @@ ompi_mtl_ofi_post_recv_excid_buffer(bool blocking, struct ompi_communicator_t *c
636617
ofi_req->completion_count = 1;
637618
ofi_req->comm = comm;
638619

639-
MTL_OFI_RETRY_UNTIL_DONE(fi_recv(ompi_mtl_ofi.ofi_ctxt[0].rx_ep,
640-
start,
641-
length,
642-
NULL,
643-
FI_ADDR_UNSPEC,
644-
(void *)&ofi_req->ctx), ret);
620+
OFI_RETRY_UNTIL_DONE(fi_recv(ompi_mtl_ofi.ofi_ctxt[0].rx_ep,
621+
start,
622+
length,
623+
NULL,
624+
FI_ADDR_UNSPEC,
625+
(void *)&ofi_req->ctx), ret);
645626
if (OPAL_UNLIKELY(0 > ret)) {
646627
if (NULL != ofi_req->buffer) {
647628
free(ofi_req->buffer);
@@ -689,14 +670,14 @@ ompi_mtl_ofi_ssend_recv(ompi_mtl_ofi_request_t *ack_req,
689670

690671
ofi_req->completion_count += 1;
691672

692-
MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep,
693-
NULL,
694-
0,
695-
NULL,
696-
*src_addr,
697-
*match_bits | ompi_mtl_ofi.sync_send_ack,
698-
0, /* Exact match, no ignore bits */
699-
(void *) &ack_req->ctx), ret);
673+
OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep,
674+
NULL,
675+
0,
676+
NULL,
677+
*src_addr,
678+
*match_bits | ompi_mtl_ofi.sync_send_ack,
679+
0, /* Exact match, no ignore bits */
680+
(void *) &ack_req->ctx), ret);
700681
if (OPAL_UNLIKELY(0 > ret)) {
701682
opal_output_verbose(1, opal_common_ofi.output,
702683
"%s:%d: fi_trecv failed: %s(%zd)",
@@ -788,16 +769,16 @@ ompi_mtl_ofi_send_excid(struct mca_mtl_base_module_t *mtl,
788769

789770
if (ompi_mtl_ofi.max_inject_size >= length) {
790771
if (ofi_cq_data) {
791-
MTL_OFI_RETRY_UNTIL_DONE(fi_injectdata(ompi_mtl_ofi.ofi_ctxt[0].tx_ep,
792-
start,
793-
length,
794-
comm->c_my_rank,
795-
sep_peer_fiaddr), ret);
772+
OFI_RETRY_UNTIL_DONE(fi_injectdata(ompi_mtl_ofi.ofi_ctxt[0].tx_ep,
773+
start,
774+
length,
775+
comm->c_my_rank,
776+
sep_peer_fiaddr), ret);
796777
} else {
797-
MTL_OFI_RETRY_UNTIL_DONE(fi_inject(ompi_mtl_ofi.ofi_ctxt[0].tx_ep,
798-
start,
799-
length,
800-
sep_peer_fiaddr), ret);
778+
OFI_RETRY_UNTIL_DONE(fi_inject(ompi_mtl_ofi.ofi_ctxt[0].tx_ep,
779+
start,
780+
length,
781+
sep_peer_fiaddr), ret);
801782
}
802783
if (OPAL_UNLIKELY(0 > ret)) {
803784
MTL_OFI_LOG_FI_ERR(ret,
@@ -808,20 +789,20 @@ ompi_mtl_ofi_send_excid(struct mca_mtl_base_module_t *mtl,
808789
} else {
809790
ofi_req->completion_count = 1;
810791
if (ofi_cq_data) {
811-
MTL_OFI_RETRY_UNTIL_DONE(fi_senddata(ompi_mtl_ofi.ofi_ctxt[0].tx_ep,
812-
start,
813-
length,
814-
NULL,
815-
comm->c_my_rank,
816-
sep_peer_fiaddr,
817-
(void *) &ofi_req->ctx), ret);
792+
OFI_RETRY_UNTIL_DONE(fi_senddata(ompi_mtl_ofi.ofi_ctxt[0].tx_ep,
793+
start,
794+
length,
795+
NULL,
796+
comm->c_my_rank,
797+
sep_peer_fiaddr,
798+
(void *) &ofi_req->ctx), ret);
818799
} else {
819-
MTL_OFI_RETRY_UNTIL_DONE(fi_send(ompi_mtl_ofi.ofi_ctxt[0].tx_ep,
820-
start,
821-
length,
822-
NULL,
823-
sep_peer_fiaddr,
824-
(void *) &ofi_req->ctx), ret);
800+
OFI_RETRY_UNTIL_DONE(fi_send(ompi_mtl_ofi.ofi_ctxt[0].tx_ep,
801+
start,
802+
length,
803+
NULL,
804+
sep_peer_fiaddr,
805+
(void *) &ofi_req->ctx), ret);
825806
}
826807
if (OPAL_UNLIKELY(0 > ret)) {
827808
MTL_OFI_LOG_FI_ERR(ret,
@@ -952,14 +933,14 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
952933
if (!(convertor->flags & CONVERTOR_ACCELERATOR)
953934
&& (ompi_mtl_ofi.max_inject_size >= length)) {
954935
if (ofi_cq_data) {
955-
MTL_OFI_RETRY_UNTIL_DONE(fi_tinjectdata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
956-
start,
957-
length,
958-
comm->c_my_rank,
959-
sep_peer_fiaddr,
960-
match_bits), ret);
936+
OFI_RETRY_UNTIL_DONE(fi_tinjectdata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
937+
start,
938+
length,
939+
comm->c_my_rank,
940+
sep_peer_fiaddr,
941+
match_bits), ret);
961942
} else {
962-
MTL_OFI_RETRY_UNTIL_DONE(fi_tinject(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
943+
OFI_RETRY_UNTIL_DONE(fi_tinject(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
963944
start,
964945
length,
965946
sep_peer_fiaddr,
@@ -984,16 +965,16 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
984965
}
985966
ofi_req.completion_count += 1;
986967
if (ofi_cq_data) {
987-
MTL_OFI_RETRY_UNTIL_DONE(fi_tsenddata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
988-
start,
989-
length,
990-
(NULL == ofi_req.mr) ? NULL : ofi_req.mr->mem_desc,
991-
comm->c_my_rank,
992-
sep_peer_fiaddr,
993-
match_bits,
994-
(void *) &ofi_req.ctx), ret);
968+
OFI_RETRY_UNTIL_DONE(fi_tsenddata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
969+
start,
970+
length,
971+
(NULL == ofi_req.mr) ? NULL : ofi_req.mr->mem_desc,
972+
comm->c_my_rank,
973+
sep_peer_fiaddr,
974+
match_bits,
975+
(void *) &ofi_req.ctx), ret);
995976
} else {
996-
MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
977+
OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
997978
start,
998979
length,
999980
(NULL == ofi_req.mr) ? NULL : ofi_req.mr->mem_desc,
@@ -1092,8 +1073,8 @@ ompi_mtl_ofi_gen_ssend_ack(struct fi_cq_tagged_entry *wc,
10921073
tagged_msg.context = NULL;
10931074
tagged_msg.data = 0;
10941075

1095-
MTL_OFI_RETRY_UNTIL_DONE(fi_tsendmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
1096-
&tagged_msg, 0), ret);
1076+
OFI_RETRY_UNTIL_DONE(fi_tsendmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
1077+
&tagged_msg, 0), ret);
10971078
if (OPAL_UNLIKELY(0 > ret)) {
10981079
MTL_OFI_LOG_FI_ERR(ret, "fi_tsendmsg failed during ompi_mtl_ofi_gen_ssend_ack");
10991080
ret = OMPI_ERROR;
@@ -1238,16 +1219,16 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
12381219

12391220

12401221
if (ofi_cq_data) {
1241-
MTL_OFI_RETRY_UNTIL_DONE(fi_tsenddata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
1242-
start,
1243-
length,
1244-
(NULL == ofi_req->mr) ? NULL : ofi_req->mr->mem_desc,
1245-
comm->c_my_rank,
1246-
sep_peer_fiaddr,
1247-
match_bits,
1248-
(void *) &ofi_req->ctx), ret);
1222+
OFI_RETRY_UNTIL_DONE(fi_tsenddata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
1223+
start,
1224+
length,
1225+
(NULL == ofi_req->mr) ? NULL : ofi_req->mr->mem_desc,
1226+
comm->c_my_rank,
1227+
sep_peer_fiaddr,
1228+
match_bits,
1229+
(void *) &ofi_req->ctx), ret);
12491230
} else {
1250-
MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
1231+
OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
12511232
start,
12521233
length,
12531234
(NULL == ofi_req->mr) ? NULL : ofi_req->mr->mem_desc,
@@ -1456,7 +1437,7 @@ ompi_mtl_ofi_irecv_generic(struct mca_mtl_base_module_t *mtl,
14561437
return ompi_ret;
14571438
}
14581439

1459-
MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep,
1440+
OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep,
14601441
start,
14611442
length,
14621443
(NULL == ofi_req->mr) ? NULL : ofi_req->mr->mem_desc,
@@ -1608,7 +1589,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
16081589
msg.context = (void *)&ofi_req->ctx;
16091590
msg.data = 0;
16101591

1611-
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
1592+
OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
16121593
if (OPAL_UNLIKELY(0 > ret)) {
16131594
ompi_mtl_ofi_deregister_and_free_buffer(ofi_req);
16141595
MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
@@ -1740,7 +1721,7 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
17401721
ofi_req.completion_count = 1;
17411722
ofi_req.match_state = 0;
17421723

1743-
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
1724+
OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
17441725
if (OPAL_UNLIKELY(0 > ret)) {
17451726
MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
17461727
return ompi_mtl_ofi_get_error(ret);
@@ -1849,7 +1830,7 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
18491830
ofi_req->match_state = 0;
18501831
ofi_req->mask_bits = mask_bits;
18511832

1852-
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
1833+
OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
18531834
if (OPAL_UNLIKELY(0 > ret)) {
18541835
MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
18551836
free(ofi_req);

opal/mca/common/ofi/common_ofi.h

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2015 Intel, Inc. All rights reserved.
44
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
55
* reserved.
6-
* Copyright (c) 2020-2022 Triad National Security, LLC. All rights
6+
* Copyright (c) 2020-2024 Triad National Security, LLC. All rights
77
* reserved.
88
* Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
99
* reserved.
@@ -32,6 +32,26 @@ typedef struct opal_common_ofi_module {
3232
int output;
3333
} opal_common_ofi_module_t;
3434

35+
/**
36+
* When attempting to execute an OFI operation we need to handle
37+
* resource overrun cases. When a call to an OFI OP fails with -FI_EAGAIN
38+
* the OFI mtl/btl will attempt to progress any pending Completion Queue
39+
* events that may prevent additional operations from being enqueued.
40+
* After progress has been attempted, the function call is retried for as
41+
* long as it keeps returning -FI_EAGAIN; any other return value ends the loop.
42+
*/
43+
#define OFI_RETRY_UNTIL_DONE(FUNC, RETURN) \
44+
do { \
45+
do { \
46+
RETURN = FUNC; \
47+
if (OPAL_LIKELY(0 == RETURN)) {break;} \
48+
if (OPAL_LIKELY(RETURN == -FI_EAGAIN)) { \
49+
opal_progress(); \
50+
} \
51+
} while (OPAL_LIKELY(-FI_EAGAIN == RETURN)); \
52+
} while (0);
53+
54+
3555
extern opal_common_ofi_module_t opal_common_ofi;
3656

3757
/**

0 commit comments

Comments
 (0)