3838#include  "mtl_ofi_endpoint.h" 
3939#include  "mtl_ofi_compat.h" 
4040
41+ #define  FI_RETRY_UNTIL_DONE (FUNC ) \
42+     do { \
43+         do { \
44+             ret = FUNC; \
45+             if(OPAL_LIKELY(0 == ret)) {break;} \
46+         } while(-FI_EAGAIN == ret); \
47+     } while(0);
48+ 
4149BEGIN_C_DECLS 
4250
4351extern  mca_mtl_ofi_module_t  ompi_mtl_ofi ;
@@ -56,7 +64,8 @@ int ompi_mtl_ofi_progress_no_inline(void);
5664__opal_attribute_always_inline__  static  inline  int 
5765ompi_mtl_ofi_progress (void )
5866{
59-     int  ret , count  =  0 ;
67+     ssize_t  ret ;
68+     int  count  =  0 ;
6069    struct  fi_cq_tagged_entry  wc  =  { 0  };
6170    struct  fi_cq_err_entry  error  =  { 0  };
6271    ompi_mtl_ofi_request_t  * ofi_req  =  NULL ;
@@ -215,10 +224,10 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
215224                        mca_pml_base_send_mode_t  mode ,
216225                        ompi_mtl_ofi_request_t  * ofi_req )
217226{
218-     int  ret ;
227+     int  ompi_ret ;
219228    void  * start ;
220229    size_t  length ;
221-     ssize_t  ret_length ;
230+     ssize_t  ret ;
222231    bool  free_after ;
223232    uint64_t  match_bits ;
224233    ompi_proc_t  * ompi_proc  =  NULL ;
@@ -228,8 +237,8 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
228237    ompi_proc  =  ompi_comm_peer_lookup (comm , dest );
229238    endpoint  =  ompi_proc -> proc_endpoints [OMPI_PROC_ENDPOINT_TAG_MTL ];
230239
231-     ret  =  ompi_mtl_datatype_pack (convertor , & start , & length , & free_after );
232-     if  (OMPI_SUCCESS  !=  ret ) return  ret ;
240+     ompi_ret  =  ompi_mtl_datatype_pack (convertor , & start , & length , & free_after );
241+     if  (OMPI_SUCCESS  !=  ompi_ret ) return  ompi_ret ;
233242
234243    ofi_req -> buffer  =  (free_after ) ? start  : NULL ;
235244    ofi_req -> length  =  length ;
@@ -245,19 +254,18 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
245254        ofi_req -> completion_count  =  2 ;
246255        MTL_OFI_SET_SEND_BITS (match_bits , comm -> c_contextid ,
247256                              comm -> c_my_rank , tag , MTL_OFI_SYNC_SEND );
248-         ret_length   =   fi_trecv (ompi_mtl_ofi .ep ,
249-                               NULL ,
250-                               0 ,
251-                               NULL ,
252-                               endpoint -> peer_fiaddr ,
253-                               match_bits  | MTL_OFI_SYNC_SEND_ACK ,
254-                               0 , /* Exact match, no ignore bits */ 
255-                               (void  * ) & ack_req -> ctx );
256-         if  (OPAL_UNLIKELY (ret_length   <   0 )) {
257+         FI_RETRY_UNTIL_DONE ( fi_trecv (ompi_mtl_ofi .ep ,
258+                                       NULL ,
259+                                       0 ,
260+                                       NULL ,
261+                                       endpoint -> peer_fiaddr ,
262+                                       match_bits  | MTL_OFI_SYNC_SEND_ACK ,
263+                                       0 , /* Exact match, no ignore bits */ 
264+                                       (void  * ) & ack_req -> ctx ) );
265+         if  (OPAL_UNLIKELY (0   >   ret )) {
257266            opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
258267                                "%s:%d: fi_trecv failed: %s(%zd)" ,
259-                                 __FILE__ , __LINE__ ,
260-                                 strerror (errno ), ret_length );
268+                                 __FILE__ , __LINE__ , fi_strerror (- ret ), ret );
261269            return  ompi_mtl_ofi_get_error (ret );
262270        }
263271    } else  {
@@ -267,32 +275,31 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
267275    }
268276
269277    if  (ompi_mtl_ofi .max_inject_size  >= length ) {
270-         ret_length   =   fi_tinject (ompi_mtl_ofi .ep ,
271-                                 start ,
272-                                 length ,
273-                                 endpoint -> peer_fiaddr ,
274-                                 match_bits );
275-         if  (OPAL_UNLIKELY (0  >  ret_length )) {
278+         FI_RETRY_UNTIL_DONE ( fi_tinject (ompi_mtl_ofi .ep ,
279+                                         start ,
280+                                         length ,
281+                                         endpoint -> peer_fiaddr ,
282+                                         match_bits ) );
283+         if  (OPAL_UNLIKELY (0  >  ret )) {
276284            opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
277-                                 "%s:%d: fi_tinject failed: %zd " ,
278-                                 __FILE__ , __LINE__ , ret_length );
285+                                 "%s:%d: fi_tinject failed: %s(%zd) " ,
286+                                 __FILE__ , __LINE__ , fi_strerror ( - ret ),  ret );
279287            return  ompi_mtl_ofi_get_error (ret );
280288        }
281289
282290        ofi_req -> event_callback (NULL ,ofi_req );
283291    } else  {
284-         ret_length  =  fi_tsend (ompi_mtl_ofi .ep ,
285-                               start ,
286-                               length ,
287-                               NULL ,
288-                               endpoint -> peer_fiaddr ,
289-                               match_bits ,
290-                               (void  * ) & ofi_req -> ctx );
291- 
292-         if  (OPAL_UNLIKELY (0  >  ret_length )) {
292+         FI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ep ,
293+                                      start ,
294+                                      length ,
295+                                      NULL ,
296+                                      endpoint -> peer_fiaddr ,
297+                                      match_bits ,
298+                                      (void  * ) & ofi_req -> ctx ));
299+         if  (OPAL_UNLIKELY (0  >  ret )) {
293300            opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
294-                                 "%s:%d: fi_tsend failed: %zd " ,
295-                                 __FILE__ , __LINE__ , ret_length );
301+                                 "%s:%d: fi_tsend failed: %s(%zd) " ,
302+                                 __FILE__ , __LINE__ , fi_strerror ( - ret ),  ret );
296303            return  ompi_mtl_ofi_get_error (ret );
297304        }
298305    }
@@ -388,8 +395,8 @@ __opal_attribute_always_inline__ static inline int
388395ompi_mtl_ofi_recv_callback (struct  fi_cq_tagged_entry  * wc ,
389396                           ompi_mtl_ofi_request_t  * ofi_req )
390397{
391-     int  ret ;
392-     ssize_t  ret_length ;
398+     int  ompi_ret ;
399+     ssize_t  ret ;
393400    ompi_proc_t  * ompi_proc  =  NULL ;
394401    mca_mtl_ofi_endpoint_t  * endpoint  =  NULL ;
395402    int  src ;
@@ -419,14 +426,14 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
419426     * Unpack data into recv buffer if necessary. 
420427     */ 
421428    if  (OPAL_UNLIKELY (ofi_req -> buffer )) {
422-         ret  =  ompi_mtl_datatype_unpack (ofi_req -> convertor ,
423-                                        ofi_req -> buffer ,
424-                                        wc -> len );
425-         if  (OPAL_UNLIKELY (OMPI_SUCCESS  !=  ret )) {
429+         ompi_ret  =  ompi_mtl_datatype_unpack (ofi_req -> convertor ,
430+                                              ofi_req -> buffer ,
431+                                              wc -> len );
432+         if  (OPAL_UNLIKELY (OMPI_SUCCESS  !=  ompi_ret )) {
426433            opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
427434                                "%s:%d: ompi_mtl_datatype_unpack failed: %d" ,
428-                                 __FILE__ , __LINE__ , ret );
429-             status -> MPI_ERROR  =  ret ;
435+                                 __FILE__ , __LINE__ , ompi_ret );
436+             status -> MPI_ERROR  =  ompi_ret ;
430437        }
431438    }
432439
@@ -456,18 +463,17 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
456463            endpoint  =  ompi_proc -> proc_endpoints [OMPI_PROC_ENDPOINT_TAG_MTL ];
457464            ofi_req -> remote_addr  =  endpoint -> peer_fiaddr ;
458465        }
459- 	    ret_length  =  fi_tsend (ompi_mtl_ofi .ep ,
460-                               NULL ,
461-                               0 ,
462-                               NULL ,
463-                               ofi_req -> remote_addr ,
464-                               wc -> tag  | MTL_OFI_SYNC_SEND_ACK ,
465-                               (void  * ) & ofi_req -> ctx );
466- 
467-         if  (OPAL_UNLIKELY (ret_length  <  0 )) {
466+ 	    FI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ep ,
467+                                      NULL ,
468+                                      0 ,
469+                                      NULL ,
470+                                      ofi_req -> remote_addr ,
471+                                      wc -> tag  | MTL_OFI_SYNC_SEND_ACK ,
472+                                      (void  * ) & ofi_req -> ctx ));
473+         if  (OPAL_UNLIKELY (0  >  ret )) {
468474            opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
469-                                 "%s:%d: fi_tsend failed: %zd " ,
470-                                 __FILE__ , __LINE__ , ret_length );
475+                                 "%s:%d: fi_tsend failed: %s(%zd) " ,
476+                                 __FILE__ , __LINE__ , fi_strerror ( - ret ),  ret );
471477            status -> MPI_ERROR  =  OMPI_ERROR ;
472478        }
473479    } else  {
@@ -513,8 +519,8 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
513519                   struct  opal_convertor_t  * convertor ,
514520                   mca_mtl_request_t  * mtl_request )
515521{
516-     int  ret  =  OMPI_SUCCESS ;
517-     ssize_t  ret_length ;
522+     int  ompi_ret  =  OMPI_SUCCESS ;
523+     ssize_t  ret ;
518524    uint64_t  match_bits , mask_bits ;
519525    fi_addr_t  remote_addr ;
520526    ompi_proc_t  * ompi_proc  =  NULL ;
@@ -534,9 +540,12 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
534540
535541    MTL_OFI_SET_RECV_BITS (match_bits , mask_bits , comm -> c_contextid , src , tag );
536542
537-     ret  =  ompi_mtl_datatype_recv_buf (convertor , & start , & length , & free_after );
538-     if  (OPAL_UNLIKELY (OMPI_SUCCESS  !=  ret )) {
539-         return  ret ;
543+     ompi_ret  =  ompi_mtl_datatype_recv_buf (convertor ,
544+                                           & start ,
545+                                           & length ,
546+                                           & free_after );
547+     if  (OPAL_UNLIKELY (OMPI_SUCCESS  !=  ompi_ret )) {
548+         return  ompi_ret ;
540549    }
541550
542551    ofi_req -> type  =  OMPI_MTL_OFI_RECV ;
@@ -551,22 +560,21 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
551560    ofi_req -> remote_addr  =  remote_addr ;
552561    ofi_req -> match_bits  =  match_bits ;
553562
554-     ret_length  =  fi_trecv (ompi_mtl_ofi .ep ,
555-                           start ,
556-                           length ,
557-                           NULL ,
558-                           remote_addr ,
559-                           match_bits ,
560-                           mask_bits ,
561-                           (void  * )& ofi_req -> ctx );
562- 
563-     if  (OPAL_UNLIKELY (ret_length  <  0 )) {
563+     FI_RETRY_UNTIL_DONE (fi_trecv (ompi_mtl_ofi .ep ,
564+                                  start ,
565+                                  length ,
566+                                  NULL ,
567+                                  remote_addr ,
568+                                  match_bits ,
569+                                  mask_bits ,
570+                                  (void  * )& ofi_req -> ctx ));
571+     if  (OPAL_UNLIKELY (0  >  ret )) {
564572        if  (NULL  !=  ofi_req -> buffer ) {
565573            free (ofi_req -> buffer );
566574        }
567575        opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
568576                            "%s:%d: fi_trecv failed: %s(%zd)" ,
569-                             __FILE__ , __LINE__ , strerror ( errno ), ret_length );
577+                             __FILE__ , __LINE__ , fi_strerror ( - ret ), ret );
570578        return  ompi_mtl_ofi_get_error (ret );
571579    }
572580
@@ -637,12 +645,16 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
637645    bool  free_after ;
638646    struct  iovec  iov ;
639647    struct  fi_msg_tagged  msg ;
640-     int  ret ;
648+     int  ompi_ret ;
649+     ssize_t  ret ;
641650    uint64_t  msgflags  =  FI_CLAIM ;
642651
643-     ret  =  ompi_mtl_datatype_recv_buf (convertor , & start , & length , & free_after );
644-     if  (OPAL_UNLIKELY (OMPI_SUCCESS  !=  ret )) {
645-         return  ret ;
652+     ompi_ret  =  ompi_mtl_datatype_recv_buf (convertor ,
653+                                           & start ,
654+                                           & length ,
655+                                           & free_after );
656+     if  (OPAL_UNLIKELY (OMPI_SUCCESS  !=  ompi_ret )) {
657+         return  ompi_ret ;
646658    }
647659
648660    ofi_req -> type  =  OMPI_MTL_OFI_RECV ;
@@ -668,12 +680,12 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
668680    msg .context  =  (void  * )& ofi_req -> ctx ;
669681    msg .data  =  0 ;
670682
671-     ret   =   fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags );
672-     if  (ret   <   0 ) {
683+     FI_RETRY_UNTIL_DONE ( fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ) );
684+     if  (OPAL_UNLIKELY ( 0   >   ret ) ) {
673685        opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
674-                             "%s:%d: unexpected return code from fi_trecvmsg: %d " ,
675-                             __FILE__ , __LINE__ , ret );
676-         return  ompi_mtl_ofi_get_error (- ret );
686+                             "%s:%d: unexpected return code from fi_trecvmsg: %s(%zd) " ,
687+                             __FILE__ , __LINE__ , fi_strerror ( - ret ),  ret );
688+         return  ompi_mtl_ofi_get_error (ret );
677689    }
678690
679691    return  OMPI_SUCCESS ;
@@ -723,7 +735,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
723735    mca_mtl_ofi_endpoint_t  * endpoint  =  NULL ;
724736    fi_addr_t  remote_proc  =  0 ;
725737    uint64_t  match_bits , mask_bits ;
726-     int  ret ;
738+     ssize_t  ret ;
727739    struct  fi_msg_tagged  msg ;
728740    uint64_t  msgflags  =  FI_PEEK ;
729741
@@ -761,18 +773,18 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
761773    ofi_req .completion_count  =  1 ;
762774    ofi_req .match_state  =  0 ;
763775
764-     ret   =   fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags );
765-     if  (ret   <   0   &&   - FI_ENOMSG  ==  ret ) {
776+     FI_RETRY_UNTIL_DONE ( fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ) );
777+     if  (- FI_ENOMSG  ==  ret ) {
766778        /** 
767779         * The search request completed but no matching message was found. 
768780         */ 
769781        * flag  =  0 ;
770782        return  OMPI_SUCCESS ;
771-     } else  if  (ret   <   0 ) {
783+     } else  if  (OPAL_UNLIKELY ( 0   >   ret ) ) {
772784        opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
773-                             "%s:%d: unexpected return code from  fi_trecvmsg: %d " ,
774-                             __FILE__ , __LINE__ , ret );
775-         return  ompi_mtl_ofi_get_error (- ret );
785+                             "%s:%d: fi_trecvmsg failed : %s(%zd) " ,
786+                             __FILE__ , __LINE__ , fi_strerror ( - ret ),  ret );
787+         return  ompi_mtl_ofi_get_error (ret );
776788    }
777789
778790    while  (0  <  ofi_req .completion_count ) {
@@ -803,7 +815,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
803815    mca_mtl_ofi_endpoint_t  * endpoint  =  NULL ;
804816    fi_addr_t  remote_proc  =  0 ;
805817    uint64_t  match_bits , mask_bits ;
806-     int  ret ;
818+     ssize_t  ret ;
807819    struct  fi_msg_tagged  msg ;
808820    uint64_t  msgflags  =  FI_PEEK  | FI_CLAIM ;
809821
@@ -846,18 +858,18 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
846858    ofi_req -> completion_count  =  1 ;
847859    ofi_req -> match_state  =  0 ;
848860
849-     ret   =   fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags );
850-     if  (ret   <   0   &&   - FI_ENOMSG  ==  ret ) {
861+     FI_RETRY_UNTIL_DONE ( fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ) );
862+     if  (- FI_ENOMSG  ==  ret ) {
851863        /** 
852864         * The search request completed but no matching message was found. 
853865         */ 
854866        * matched  =  0 ;
855867        return  OMPI_SUCCESS ;
856-     } else  if  (ret   <   0 ) {
868+     } else  if  (OPAL_UNLIKELY ( 0   >   ret ) ) {
857869        opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
858-                             "%s:%d: unexpected return code from  fi_trecvmsg: %d " ,
859-                             __FILE__ , __LINE__ , ret );
860-         return  ompi_mtl_ofi_get_error (- ret );
870+                             "%s:%d: fi_trecvmsg failed : %s(%zd) " ,
871+                             __FILE__ , __LINE__ , fi_strerror ( - ret ),  ret );
872+         return  ompi_mtl_ofi_get_error (ret );
861873    }
862874
863875    while  (0  <  ofi_req -> completion_count ) {
0 commit comments