Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit 21b19ac

Browse files
author
rhc54
committed
Merge pull request #576 from yburette/v1.10
mtl/ofi: Cherry-pick changes from master.
2 parents c7c5700 + 00eaffe commit 21b19ac

File tree

1 file changed: 108 additions, 97 deletions

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 108 additions & 97 deletions
Original file line number | Diff line number | Diff line change
@@ -32,12 +32,19 @@
3232
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
3333
#include "ompi/message/message.h"
3434

35-
#include "mtl_ofi.h"
3635
#include "mtl_ofi_types.h"
3736
#include "mtl_ofi_request.h"
3837
#include "mtl_ofi_endpoint.h"
3938
#include "mtl_ofi_compat.h"
4039

40+
/*
 * MTL_OFI_RETRY_UNTIL_DONE(FUNC): evaluate FUNC and store its value in `ret`,
 * repeating for as long as that value equals -FI_EAGAIN.
 *
 * - `ret` is not declared by the macro; a variable with that name must be in
 *   scope wherever the macro is expanded, and it holds the last value
 *   returned by FUNC once the macro finishes.
 * - The loop stops as soon as FUNC yields 0 or any value other than
 *   -FI_EAGAIN; the caller is expected to inspect `ret` afterwards.
 * - FUNC is re-evaluated on every iteration, so any side effects in the
 *   expression are repeated. No upper bound on the number of retries is
 *   applied here.
 *
 * NOTE(review): the expansion ends with `while(0);` -- the semicolon is part
 * of the macro. A call written as `MTL_OFI_RETRY_UNTIL_DONE(...);` therefore
 * produces an extra empty statement, which would break an unbraced
 * `if (...) MACRO(...); else ...`. Confirm whether the trailing `;` is
 * intended.
 */
#define MTL_OFI_RETRY_UNTIL_DONE(FUNC) \
41+
do { \
42+
do { \
43+
ret = FUNC; \
44+
if(OPAL_LIKELY(0 == ret)) {break;} \
45+
} while(-FI_EAGAIN == ret); \
46+
} while(0);
47+
4148
BEGIN_C_DECLS
4249

4350
extern mca_mtl_ofi_module_t ompi_mtl_ofi;
@@ -56,7 +63,8 @@ int ompi_mtl_ofi_progress_no_inline(void);
5663
__opal_attribute_always_inline__ static inline int
5764
ompi_mtl_ofi_progress(void)
5865
{
59-
int ret, count = 0;
66+
ssize_t ret;
67+
int count = 0;
6068
struct fi_cq_tagged_entry wc = { 0 };
6169
struct fi_cq_err_entry error = { 0 };
6270
ompi_mtl_ofi_request_t *ofi_req = NULL;
@@ -76,7 +84,7 @@ ompi_mtl_ofi_progress(void)
7684
ret = ofi_req->event_callback(&wc, ofi_req);
7785
if (OMPI_SUCCESS != ret) {
7886
opal_output(ompi_mtl_base_framework.framework_output,
79-
"Error returned by request event callback: %d",
87+
"Error returned by request event callback: %zd",
8088
ret);
8189
abort();
8290
}
@@ -91,7 +99,7 @@ ompi_mtl_ofi_progress(void)
9199
0);
92100
if (ret) {
93101
opal_output(ompi_mtl_base_framework.framework_output,
94-
"Error returned from fi_cq_readerr: %d", ret);
102+
"Error returned from fi_cq_readerr: %zd", ret);
95103
}
96104

97105
assert(error.op_context);
@@ -100,7 +108,7 @@ ompi_mtl_ofi_progress(void)
100108
ret = ofi_req->error_callback(&error, ofi_req);
101109
if (OMPI_SUCCESS != ret) {
102110
opal_output(ompi_mtl_base_framework.framework_output,
103-
"Error returned by request error callback: %d",
111+
"Error returned by request error callback: %zd",
104112
ret);
105113
abort();
106114
}
@@ -215,10 +223,10 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
215223
mca_pml_base_send_mode_t mode,
216224
ompi_mtl_ofi_request_t *ofi_req)
217225
{
218-
int ret;
226+
int ompi_ret;
219227
void *start;
220228
size_t length;
221-
ssize_t ret_length;
229+
ssize_t ret;
222230
bool free_after;
223231
uint64_t match_bits;
224232
ompi_proc_t *ompi_proc = NULL;
@@ -228,8 +236,8 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
228236
ompi_proc = ompi_comm_peer_lookup(comm, dest);
229237
endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
230238

231-
ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
232-
if (OMPI_SUCCESS != ret) return ret;
239+
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
240+
if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
233241

234242
ofi_req->buffer = (free_after) ? start : NULL;
235243
ofi_req->length = length;
@@ -245,19 +253,18 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
245253
ofi_req->completion_count = 2;
246254
MTL_OFI_SET_SEND_BITS(match_bits, comm->c_contextid,
247255
comm->c_my_rank, tag, MTL_OFI_SYNC_SEND);
248-
ret_length = fi_trecv(ompi_mtl_ofi.ep,
249-
NULL,
250-
0,
251-
NULL,
252-
endpoint->peer_fiaddr,
253-
match_bits | MTL_OFI_SYNC_SEND_ACK,
254-
0, /* Exact match, no ignore bits */
255-
(void *) &ack_req->ctx);
256-
if (OPAL_UNLIKELY(ret_length < 0)) {
256+
MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ep,
257+
NULL,
258+
0,
259+
NULL,
260+
endpoint->peer_fiaddr,
261+
match_bits | MTL_OFI_SYNC_SEND_ACK,
262+
0, /* Exact match, no ignore bits */
263+
(void *) &ack_req->ctx));
264+
if (OPAL_UNLIKELY(0 > ret)) {
257265
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
258266
"%s:%d: fi_trecv failed: %s(%zd)",
259-
__FILE__, __LINE__,
260-
strerror(errno), ret_length);
267+
__FILE__, __LINE__, fi_strerror(-ret), ret);
261268
return ompi_mtl_ofi_get_error(ret);
262269
}
263270
} else {
@@ -267,32 +274,31 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
267274
}
268275

269276
if (ompi_mtl_ofi.max_inject_size >= length) {
270-
ret_length = fi_tinject(ompi_mtl_ofi.ep,
271-
start,
272-
length,
273-
endpoint->peer_fiaddr,
274-
match_bits);
275-
if (OPAL_UNLIKELY(0 > ret_length)) {
277+
MTL_OFI_RETRY_UNTIL_DONE(fi_tinject(ompi_mtl_ofi.ep,
278+
start,
279+
length,
280+
endpoint->peer_fiaddr,
281+
match_bits));
282+
if (OPAL_UNLIKELY(0 > ret)) {
276283
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
277-
"%s:%d: fi_tinject failed: %zd",
278-
__FILE__, __LINE__, ret_length);
284+
"%s:%d: fi_tinject failed: %s(%zd)",
285+
__FILE__, __LINE__, fi_strerror(-ret), ret);
279286
return ompi_mtl_ofi_get_error(ret);
280287
}
281288

282289
ofi_req->event_callback(NULL,ofi_req);
283290
} else {
284-
ret_length = fi_tsend(ompi_mtl_ofi.ep,
285-
start,
286-
length,
287-
NULL,
288-
endpoint->peer_fiaddr,
289-
match_bits,
290-
(void *) &ofi_req->ctx);
291-
292-
if (OPAL_UNLIKELY(0 > ret_length)) {
291+
MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep,
292+
start,
293+
length,
294+
NULL,
295+
endpoint->peer_fiaddr,
296+
match_bits,
297+
(void *) &ofi_req->ctx));
298+
if (OPAL_UNLIKELY(0 > ret)) {
293299
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
294-
"%s:%d: fi_tsend failed: %zd",
295-
__FILE__, __LINE__, ret_length);
300+
"%s:%d: fi_tsend failed: %s(%zd)",
301+
__FILE__, __LINE__, fi_strerror(-ret), ret);
296302
return ompi_mtl_ofi_get_error(ret);
297303
}
298304
}
@@ -388,8 +394,8 @@ __opal_attribute_always_inline__ static inline int
388394
ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
389395
ompi_mtl_ofi_request_t *ofi_req)
390396
{
391-
int ret;
392-
ssize_t ret_length;
397+
int ompi_ret;
398+
ssize_t ret;
393399
ompi_proc_t *ompi_proc = NULL;
394400
mca_mtl_ofi_endpoint_t *endpoint = NULL;
395401
int src;
@@ -419,14 +425,14 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
419425
* Unpack data into recv buffer if necessary.
420426
*/
421427
if (OPAL_UNLIKELY(ofi_req->buffer)) {
422-
ret = ompi_mtl_datatype_unpack(ofi_req->convertor,
423-
ofi_req->buffer,
424-
wc->len);
425-
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
428+
ompi_ret = ompi_mtl_datatype_unpack(ofi_req->convertor,
429+
ofi_req->buffer,
430+
wc->len);
431+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
426432
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
427433
"%s:%d: ompi_mtl_datatype_unpack failed: %d",
428-
__FILE__, __LINE__, ret);
429-
status->MPI_ERROR = ret;
434+
__FILE__, __LINE__, ompi_ret);
435+
status->MPI_ERROR = ompi_ret;
430436
}
431437
}
432438

@@ -456,18 +462,17 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
456462
endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
457463
ofi_req->remote_addr = endpoint->peer_fiaddr;
458464
}
459-
ret_length = fi_tsend(ompi_mtl_ofi.ep,
460-
NULL,
461-
0,
462-
NULL,
463-
ofi_req->remote_addr,
464-
wc->tag | MTL_OFI_SYNC_SEND_ACK,
465-
(void *) &ofi_req->ctx);
466-
467-
if (OPAL_UNLIKELY(ret_length < 0)) {
465+
MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep,
466+
NULL,
467+
0,
468+
NULL,
469+
ofi_req->remote_addr,
470+
wc->tag | MTL_OFI_SYNC_SEND_ACK,
471+
(void *) &ofi_req->ctx));
472+
if (OPAL_UNLIKELY(0 > ret)) {
468473
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
469-
"%s:%d: fi_tsend failed: %zd",
470-
__FILE__, __LINE__, ret_length);
474+
"%s:%d: fi_tsend failed: %s(%zd)",
475+
__FILE__, __LINE__, fi_strerror(-ret), ret);
471476
status->MPI_ERROR = OMPI_ERROR;
472477
}
473478
} else {
@@ -513,8 +518,8 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
513518
struct opal_convertor_t *convertor,
514519
mca_mtl_request_t *mtl_request)
515520
{
516-
int ret = OMPI_SUCCESS;
517-
ssize_t ret_length;
521+
int ompi_ret = OMPI_SUCCESS;
522+
ssize_t ret;
518523
uint64_t match_bits, mask_bits;
519524
fi_addr_t remote_addr;
520525
ompi_proc_t *ompi_proc = NULL;
@@ -534,9 +539,12 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
534539

535540
MTL_OFI_SET_RECV_BITS(match_bits, mask_bits, comm->c_contextid, src, tag);
536541

537-
ret = ompi_mtl_datatype_recv_buf(convertor, &start, &length, &free_after);
538-
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
539-
return ret;
542+
ompi_ret = ompi_mtl_datatype_recv_buf(convertor,
543+
&start,
544+
&length,
545+
&free_after);
546+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
547+
return ompi_ret;
540548
}
541549

542550
ofi_req->type = OMPI_MTL_OFI_RECV;
@@ -551,22 +559,21 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
551559
ofi_req->remote_addr = remote_addr;
552560
ofi_req->match_bits = match_bits;
553561

554-
ret_length = fi_trecv(ompi_mtl_ofi.ep,
555-
start,
556-
length,
557-
NULL,
558-
remote_addr,
559-
match_bits,
560-
mask_bits,
561-
(void *)&ofi_req->ctx);
562-
563-
if (OPAL_UNLIKELY(ret_length < 0)) {
562+
MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ep,
563+
start,
564+
length,
565+
NULL,
566+
remote_addr,
567+
match_bits,
568+
mask_bits,
569+
(void *)&ofi_req->ctx));
570+
if (OPAL_UNLIKELY(0 > ret)) {
564571
if (NULL != ofi_req->buffer) {
565572
free(ofi_req->buffer);
566573
}
567574
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
568575
"%s:%d: fi_trecv failed: %s(%zd)",
569-
__FILE__, __LINE__, strerror(errno), ret_length);
576+
__FILE__, __LINE__, fi_strerror(-ret), ret);
570577
return ompi_mtl_ofi_get_error(ret);
571578
}
572579

@@ -637,12 +644,16 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
637644
bool free_after;
638645
struct iovec iov;
639646
struct fi_msg_tagged msg;
640-
int ret;
647+
int ompi_ret;
648+
ssize_t ret;
641649
uint64_t msgflags = FI_CLAIM;
642650

643-
ret = ompi_mtl_datatype_recv_buf(convertor, &start, &length, &free_after);
644-
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
645-
return ret;
651+
ompi_ret = ompi_mtl_datatype_recv_buf(convertor,
652+
&start,
653+
&length,
654+
&free_after);
655+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
656+
return ompi_ret;
646657
}
647658

648659
ofi_req->type = OMPI_MTL_OFI_RECV;
@@ -668,12 +679,12 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
668679
msg.context = (void *)&ofi_req->ctx;
669680
msg.data = 0;
670681

671-
ret = fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags);
672-
if (ret < 0) {
682+
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags));
683+
if (OPAL_UNLIKELY(0 > ret)) {
673684
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
674-
"%s:%d: unexpected return code from fi_trecvmsg: %d",
675-
__FILE__, __LINE__, ret);
676-
return ompi_mtl_ofi_get_error(-ret);
685+
"%s:%d: fi_trecvmsg failed: %s(%zd)",
686+
__FILE__, __LINE__, fi_strerror(-ret), ret);
687+
return ompi_mtl_ofi_get_error(ret);
677688
}
678689

679690
return OMPI_SUCCESS;
@@ -723,7 +734,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
723734
mca_mtl_ofi_endpoint_t *endpoint = NULL;
724735
fi_addr_t remote_proc = 0;
725736
uint64_t match_bits, mask_bits;
726-
int ret;
737+
ssize_t ret;
727738
struct fi_msg_tagged msg;
728739
uint64_t msgflags = FI_PEEK;
729740

@@ -761,18 +772,18 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
761772
ofi_req.completion_count = 1;
762773
ofi_req.match_state = 0;
763774

764-
ret = fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags);
765-
if (ret < 0 && -FI_ENOMSG == ret) {
775+
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags));
776+
if (-FI_ENOMSG == ret) {
766777
/**
767778
* The search request completed but no matching message was found.
768779
*/
769780
*flag = 0;
770781
return OMPI_SUCCESS;
771-
} else if (ret < 0) {
782+
} else if (OPAL_UNLIKELY(0 > ret)) {
772783
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
773-
"%s:%d: unexpected return code from fi_trecvmsg: %d",
774-
__FILE__, __LINE__, ret);
775-
return ompi_mtl_ofi_get_error(-ret);
784+
"%s:%d: fi_trecvmsg failed: %s(%zd)",
785+
__FILE__, __LINE__, fi_strerror(-ret), ret);
786+
return ompi_mtl_ofi_get_error(ret);
776787
}
777788

778789
while (0 < ofi_req.completion_count) {
@@ -803,7 +814,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
803814
mca_mtl_ofi_endpoint_t *endpoint = NULL;
804815
fi_addr_t remote_proc = 0;
805816
uint64_t match_bits, mask_bits;
806-
int ret;
817+
ssize_t ret;
807818
struct fi_msg_tagged msg;
808819
uint64_t msgflags = FI_PEEK | FI_CLAIM;
809820

@@ -846,18 +857,18 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
846857
ofi_req->completion_count = 1;
847858
ofi_req->match_state = 0;
848859

849-
ret = fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags);
850-
if (ret < 0 && -FI_ENOMSG == ret) {
860+
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags));
861+
if (-FI_ENOMSG == ret) {
851862
/**
852863
* The search request completed but no matching message was found.
853864
*/
854865
*matched = 0;
855866
return OMPI_SUCCESS;
856-
} else if (ret < 0) {
867+
} else if (OPAL_UNLIKELY(0 > ret)) {
857868
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
858-
"%s:%d: unexpected return code from fi_trecvmsg: %d",
859-
__FILE__, __LINE__, ret);
860-
return ompi_mtl_ofi_get_error(-ret);
869+
"%s:%d: fi_trecvmsg failed: %s(%zd)",
870+
__FILE__, __LINE__, fi_strerror(-ret), ret);
871+
return ompi_mtl_ofi_get_error(ret);
861872
}
862873

863874
while (0 < ofi_req->completion_count) {

0 commit comments

Comments (0)