3232#include "ompi/mca/mtl/base/mtl_base_datatype.h"
3333#include "ompi/message/message.h"
3434
35- #include "mtl_ofi.h"
3635#include "mtl_ofi_types.h"
3736#include "mtl_ofi_request.h"
3837#include "mtl_ofi_endpoint.h"
3938#include "mtl_ofi_compat.h"
4039
/**
 * Evaluate an OFI call, re-issuing it for as long as it returns -FI_EAGAIN.
 *
 * The value of every attempt is assigned to a variable named `ret`, which
 * must already be declared in the scope where the macro is used; the macro
 * takes no output parameter of its own.  After the macro, `ret` holds the
 * first value that was not -FI_EAGAIN (0 or some other code) and the caller
 * is expected to inspect it.
 *
 * FUNC is re-evaluated on every attempt, so it must be an expression that is
 * safe to execute repeatedly.  The retry is immediate: nothing else is done
 * between attempts.
 *
 * The expansion deliberately has no trailing semicolon so that
 * `MTL_OFI_RETRY_UNTIL_DONE(...);` is exactly one statement and can be used
 * as the unbraced body of an if/else.
 */
#define MTL_OFI_RETRY_UNTIL_DONE(FUNC)                  \
    do {                                                \
        do {                                            \
            ret = (FUNC);                               \
            if (OPAL_LIKELY(0 == ret)) { break; }       \
        } while (-FI_EAGAIN == ret);                    \
    } while (0)
47+
4148BEGIN_C_DECLS
4249
4350extern mca_mtl_ofi_module_t ompi_mtl_ofi ;
@@ -56,7 +63,8 @@ int ompi_mtl_ofi_progress_no_inline(void);
5663__opal_attribute_always_inline__ static inline int
5764ompi_mtl_ofi_progress (void )
5865{
59- int ret , count = 0 ;
66+ ssize_t ret ;
67+ int count = 0 ;
6068 struct fi_cq_tagged_entry wc = { 0 };
6169 struct fi_cq_err_entry error = { 0 };
6270 ompi_mtl_ofi_request_t * ofi_req = NULL ;
@@ -76,7 +84,7 @@ ompi_mtl_ofi_progress(void)
7684 ret = ofi_req -> event_callback (& wc , ofi_req );
7785 if (OMPI_SUCCESS != ret ) {
7886 opal_output (ompi_mtl_base_framework .framework_output ,
79- "Error returned by request event callback: %d " ,
87+ "Error returned by request event callback: %zd " ,
8088 ret );
8189 abort ();
8290 }
@@ -91,7 +99,7 @@ ompi_mtl_ofi_progress(void)
9199 0 );
92100 if (ret ) {
93101 opal_output (ompi_mtl_base_framework .framework_output ,
94- "Error returned from fi_cq_readerr: %d " , ret );
102+ "Error returned from fi_cq_readerr: %zd " , ret );
95103 }
96104
97105 assert (error .op_context );
@@ -100,7 +108,7 @@ ompi_mtl_ofi_progress(void)
100108 ret = ofi_req -> error_callback (& error , ofi_req );
101109 if (OMPI_SUCCESS != ret ) {
102110 opal_output (ompi_mtl_base_framework .framework_output ,
103- "Error returned by request error callback: %d " ,
111+ "Error returned by request error callback: %zd " ,
104112 ret );
105113 abort ();
106114 }
@@ -215,10 +223,10 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
215223 mca_pml_base_send_mode_t mode ,
216224 ompi_mtl_ofi_request_t * ofi_req )
217225{
218- int ret ;
226+ int ompi_ret ;
219227 void * start ;
220228 size_t length ;
221- ssize_t ret_length ;
229+ ssize_t ret ;
222230 bool free_after ;
223231 uint64_t match_bits ;
224232 ompi_proc_t * ompi_proc = NULL ;
@@ -228,8 +236,8 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
228236 ompi_proc = ompi_comm_peer_lookup (comm , dest );
229237 endpoint = ompi_proc -> proc_endpoints [OMPI_PROC_ENDPOINT_TAG_MTL ];
230238
231- ret = ompi_mtl_datatype_pack (convertor , & start , & length , & free_after );
232- if (OMPI_SUCCESS != ret ) return ret ;
239+ ompi_ret = ompi_mtl_datatype_pack (convertor , & start , & length , & free_after );
240+ if (OMPI_SUCCESS != ompi_ret ) return ompi_ret ;
233241
234242 ofi_req -> buffer = (free_after ) ? start : NULL ;
235243 ofi_req -> length = length ;
@@ -245,19 +253,18 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
245253 ofi_req -> completion_count = 2 ;
246254 MTL_OFI_SET_SEND_BITS (match_bits , comm -> c_contextid ,
247255 comm -> c_my_rank , tag , MTL_OFI_SYNC_SEND );
248- ret_length = fi_trecv (ompi_mtl_ofi .ep ,
249- NULL ,
250- 0 ,
251- NULL ,
252- endpoint -> peer_fiaddr ,
253- match_bits | MTL_OFI_SYNC_SEND_ACK ,
254- 0 , /* Exact match, no ignore bits */
255- (void * ) & ack_req -> ctx );
256- if (OPAL_UNLIKELY (ret_length < 0 )) {
256+ MTL_OFI_RETRY_UNTIL_DONE ( fi_trecv (ompi_mtl_ofi .ep ,
257+ NULL ,
258+ 0 ,
259+ NULL ,
260+ endpoint -> peer_fiaddr ,
261+ match_bits | MTL_OFI_SYNC_SEND_ACK ,
262+ 0 , /* Exact match, no ignore bits */
263+ (void * ) & ack_req -> ctx ) );
264+ if (OPAL_UNLIKELY (0 > ret )) {
257265 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
258266 "%s:%d: fi_trecv failed: %s(%zd)" ,
259- __FILE__ , __LINE__ ,
260- strerror (errno ), ret_length );
267+ __FILE__ , __LINE__ , fi_strerror (- ret ), ret );
261268 return ompi_mtl_ofi_get_error (ret );
262269 }
263270 } else {
@@ -267,32 +274,31 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
267274 }
268275
269276 if (ompi_mtl_ofi .max_inject_size >= length ) {
270- ret_length = fi_tinject (ompi_mtl_ofi .ep ,
271- start ,
272- length ,
273- endpoint -> peer_fiaddr ,
274- match_bits );
275- if (OPAL_UNLIKELY (0 > ret_length )) {
277+ MTL_OFI_RETRY_UNTIL_DONE ( fi_tinject (ompi_mtl_ofi .ep ,
278+ start ,
279+ length ,
280+ endpoint -> peer_fiaddr ,
281+ match_bits ) );
282+ if (OPAL_UNLIKELY (0 > ret )) {
276283 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
277- "%s:%d: fi_tinject failed: %zd " ,
278- __FILE__ , __LINE__ , ret_length );
284+ "%s:%d: fi_tinject failed: %s(%zd) " ,
285+ __FILE__ , __LINE__ , fi_strerror ( - ret ), ret );
279286 return ompi_mtl_ofi_get_error (ret );
280287 }
281288
282289 ofi_req -> event_callback (NULL ,ofi_req );
283290 } else {
284- ret_length = fi_tsend (ompi_mtl_ofi .ep ,
285- start ,
286- length ,
287- NULL ,
288- endpoint -> peer_fiaddr ,
289- match_bits ,
290- (void * ) & ofi_req -> ctx );
291-
292- if (OPAL_UNLIKELY (0 > ret_length )) {
291+ MTL_OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ep ,
292+ start ,
293+ length ,
294+ NULL ,
295+ endpoint -> peer_fiaddr ,
296+ match_bits ,
297+ (void * ) & ofi_req -> ctx ));
298+ if (OPAL_UNLIKELY (0 > ret )) {
293299 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
294- "%s:%d: fi_tsend failed: %zd " ,
295- __FILE__ , __LINE__ , ret_length );
300+ "%s:%d: fi_tsend failed: %s(%zd) " ,
301+ __FILE__ , __LINE__ , fi_strerror ( - ret ), ret );
296302 return ompi_mtl_ofi_get_error (ret );
297303 }
298304 }
@@ -388,8 +394,8 @@ __opal_attribute_always_inline__ static inline int
388394ompi_mtl_ofi_recv_callback (struct fi_cq_tagged_entry * wc ,
389395 ompi_mtl_ofi_request_t * ofi_req )
390396{
391- int ret ;
392- ssize_t ret_length ;
397+ int ompi_ret ;
398+ ssize_t ret ;
393399 ompi_proc_t * ompi_proc = NULL ;
394400 mca_mtl_ofi_endpoint_t * endpoint = NULL ;
395401 int src ;
@@ -419,14 +425,14 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
419425 * Unpack data into recv buffer if necessary.
420426 */
421427 if (OPAL_UNLIKELY (ofi_req -> buffer )) {
422- ret = ompi_mtl_datatype_unpack (ofi_req -> convertor ,
423- ofi_req -> buffer ,
424- wc -> len );
425- if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
428+ ompi_ret = ompi_mtl_datatype_unpack (ofi_req -> convertor ,
429+ ofi_req -> buffer ,
430+ wc -> len );
431+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ompi_ret )) {
426432 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
427433 "%s:%d: ompi_mtl_datatype_unpack failed: %d" ,
428- __FILE__ , __LINE__ , ret );
429- status -> MPI_ERROR = ret ;
434+ __FILE__ , __LINE__ , ompi_ret );
435+ status -> MPI_ERROR = ompi_ret ;
430436 }
431437 }
432438
@@ -456,18 +462,17 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
456462 endpoint = ompi_proc -> proc_endpoints [OMPI_PROC_ENDPOINT_TAG_MTL ];
457463 ofi_req -> remote_addr = endpoint -> peer_fiaddr ;
458464 }
459- ret_length = fi_tsend (ompi_mtl_ofi .ep ,
460- NULL ,
461- 0 ,
462- NULL ,
463- ofi_req -> remote_addr ,
464- wc -> tag | MTL_OFI_SYNC_SEND_ACK ,
465- (void * ) & ofi_req -> ctx );
466-
467- if (OPAL_UNLIKELY (ret_length < 0 )) {
465+ MTL_OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ep ,
466+ NULL ,
467+ 0 ,
468+ NULL ,
469+ ofi_req -> remote_addr ,
470+ wc -> tag | MTL_OFI_SYNC_SEND_ACK ,
471+ (void * ) & ofi_req -> ctx ));
472+ if (OPAL_UNLIKELY (0 > ret )) {
468473 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
469- "%s:%d: fi_tsend failed: %zd " ,
470- __FILE__ , __LINE__ , ret_length );
474+ "%s:%d: fi_tsend failed: %s(%zd) " ,
475+ __FILE__ , __LINE__ , fi_strerror ( - ret ), ret );
471476 status -> MPI_ERROR = OMPI_ERROR ;
472477 }
473478 } else {
@@ -513,8 +518,8 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
513518 struct opal_convertor_t * convertor ,
514519 mca_mtl_request_t * mtl_request )
515520{
516- int ret = OMPI_SUCCESS ;
517- ssize_t ret_length ;
521+ int ompi_ret = OMPI_SUCCESS ;
522+ ssize_t ret ;
518523 uint64_t match_bits , mask_bits ;
519524 fi_addr_t remote_addr ;
520525 ompi_proc_t * ompi_proc = NULL ;
@@ -534,9 +539,12 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
534539
535540 MTL_OFI_SET_RECV_BITS (match_bits , mask_bits , comm -> c_contextid , src , tag );
536541
537- ret = ompi_mtl_datatype_recv_buf (convertor , & start , & length , & free_after );
538- if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
539- return ret ;
542+ ompi_ret = ompi_mtl_datatype_recv_buf (convertor ,
543+ & start ,
544+ & length ,
545+ & free_after );
546+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ompi_ret )) {
547+ return ompi_ret ;
540548 }
541549
542550 ofi_req -> type = OMPI_MTL_OFI_RECV ;
@@ -551,22 +559,21 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
551559 ofi_req -> remote_addr = remote_addr ;
552560 ofi_req -> match_bits = match_bits ;
553561
554- ret_length = fi_trecv (ompi_mtl_ofi .ep ,
555- start ,
556- length ,
557- NULL ,
558- remote_addr ,
559- match_bits ,
560- mask_bits ,
561- (void * )& ofi_req -> ctx );
562-
563- if (OPAL_UNLIKELY (ret_length < 0 )) {
562+ MTL_OFI_RETRY_UNTIL_DONE (fi_trecv (ompi_mtl_ofi .ep ,
563+ start ,
564+ length ,
565+ NULL ,
566+ remote_addr ,
567+ match_bits ,
568+ mask_bits ,
569+ (void * )& ofi_req -> ctx ));
570+ if (OPAL_UNLIKELY (0 > ret )) {
564571 if (NULL != ofi_req -> buffer ) {
565572 free (ofi_req -> buffer );
566573 }
567574 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
568575 "%s:%d: fi_trecv failed: %s(%zd)" ,
569- __FILE__ , __LINE__ , strerror ( errno ), ret_length );
576+ __FILE__ , __LINE__ , fi_strerror ( - ret ), ret );
570577 return ompi_mtl_ofi_get_error (ret );
571578 }
572579
@@ -637,12 +644,16 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
637644 bool free_after ;
638645 struct iovec iov ;
639646 struct fi_msg_tagged msg ;
640- int ret ;
647+ int ompi_ret ;
648+ ssize_t ret ;
641649 uint64_t msgflags = FI_CLAIM ;
642650
643- ret = ompi_mtl_datatype_recv_buf (convertor , & start , & length , & free_after );
644- if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
645- return ret ;
651+ ompi_ret = ompi_mtl_datatype_recv_buf (convertor ,
652+ & start ,
653+ & length ,
654+ & free_after );
655+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ompi_ret )) {
656+ return ompi_ret ;
646657 }
647658
648659 ofi_req -> type = OMPI_MTL_OFI_RECV ;
@@ -668,12 +679,12 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
668679 msg .context = (void * )& ofi_req -> ctx ;
669680 msg .data = 0 ;
670681
671- ret = fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags );
672- if (ret < 0 ) {
682+ MTL_OFI_RETRY_UNTIL_DONE ( fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ) );
683+ if (OPAL_UNLIKELY ( 0 > ret ) ) {
673684 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
674- "%s:%d: unexpected return code from fi_trecvmsg: %d " ,
675- __FILE__ , __LINE__ , ret );
676- return ompi_mtl_ofi_get_error (- ret );
685+ "%s:%d: fi_trecvmsg failed : %s(%zd) " ,
686+ __FILE__ , __LINE__ , fi_strerror ( - ret ), ret );
687+ return ompi_mtl_ofi_get_error (ret );
677688 }
678689
679690 return OMPI_SUCCESS ;
@@ -723,7 +734,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
723734 mca_mtl_ofi_endpoint_t * endpoint = NULL ;
724735 fi_addr_t remote_proc = 0 ;
725736 uint64_t match_bits , mask_bits ;
726- int ret ;
737+ ssize_t ret ;
727738 struct fi_msg_tagged msg ;
728739 uint64_t msgflags = FI_PEEK ;
729740
@@ -761,18 +772,18 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
761772 ofi_req .completion_count = 1 ;
762773 ofi_req .match_state = 0 ;
763774
764- ret = fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags );
765- if (ret < 0 && - FI_ENOMSG == ret ) {
775+ MTL_OFI_RETRY_UNTIL_DONE ( fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ) );
776+ if (- FI_ENOMSG == ret ) {
766777 /**
767778 * The search request completed but no matching message was found.
768779 */
769780 * flag = 0 ;
770781 return OMPI_SUCCESS ;
771- } else if (ret < 0 ) {
782+ } else if (OPAL_UNLIKELY ( 0 > ret ) ) {
772783 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
773- "%s:%d: unexpected return code from fi_trecvmsg: %d " ,
774- __FILE__ , __LINE__ , ret );
775- return ompi_mtl_ofi_get_error (- ret );
784+ "%s:%d: fi_trecvmsg failed : %s(%zd) " ,
785+ __FILE__ , __LINE__ , fi_strerror ( - ret ), ret );
786+ return ompi_mtl_ofi_get_error (ret );
776787 }
777788
778789 while (0 < ofi_req .completion_count ) {
@@ -803,7 +814,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
803814 mca_mtl_ofi_endpoint_t * endpoint = NULL ;
804815 fi_addr_t remote_proc = 0 ;
805816 uint64_t match_bits , mask_bits ;
806- int ret ;
817+ ssize_t ret ;
807818 struct fi_msg_tagged msg ;
808819 uint64_t msgflags = FI_PEEK | FI_CLAIM ;
809820
@@ -846,18 +857,18 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
846857 ofi_req -> completion_count = 1 ;
847858 ofi_req -> match_state = 0 ;
848859
849- ret = fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags );
850- if (ret < 0 && - FI_ENOMSG == ret ) {
860+ MTL_OFI_RETRY_UNTIL_DONE ( fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ) );
861+ if (- FI_ENOMSG == ret ) {
851862 /**
852863 * The search request completed but no matching message was found.
853864 */
854865 * matched = 0 ;
855866 return OMPI_SUCCESS ;
856- } else if (ret < 0 ) {
867+ } else if (OPAL_UNLIKELY ( 0 > ret ) ) {
857868 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
858- "%s:%d: unexpected return code from fi_trecvmsg: %d " ,
859- __FILE__ , __LINE__ , ret );
860- return ompi_mtl_ofi_get_error (- ret );
869+ "%s:%d: fi_trecvmsg failed : %s(%zd) " ,
870+ __FILE__ , __LINE__ , fi_strerror ( - ret ), ret );
871+ return ompi_mtl_ofi_get_error (ret );
861872 }
862873
863874 while (0 < ofi_req -> completion_count ) {
0 commit comments