2
2
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
3
3
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
4
4
* reserved.
5
- * Copyright (c) 2019-2023 Triad National Security, LLC. All rights
5
+ * Copyright (c) 2019-2024 Triad National Security, LLC. All rights
6
6
* reserved.
7
7
* Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All Rights reserved.
8
8
* reserved.
@@ -266,25 +266,6 @@ ompi_mtl_ofi_progress(void)
266
266
return count ;
267
267
}
268
268
269
- /**
270
- * When attempting to execute an OFI operation we need to handle
271
- * resource overrun cases. When a call to an OFI OP fails with -FI_EAGAIN
272
- * the OFI mtl will attempt to progress any pending Completion Queue
273
- * events that may prevent additional operations to be enqueued.
274
- * If the call to ofi progress is successful, then the function call
275
- * will be retried.
276
- */
277
- #define MTL_OFI_RETRY_UNTIL_DONE (FUNC , RETURN ) \
278
- do { \
279
- do { \
280
- RETURN = FUNC; \
281
- if (OPAL_LIKELY(0 == RETURN)) {break;} \
282
- if (OPAL_LIKELY(RETURN == -FI_EAGAIN)) { \
283
- ompi_mtl_ofi_progress(); \
284
- } \
285
- } while (OPAL_LIKELY(-FI_EAGAIN == RETURN)); \
286
- } while (0);
287
-
288
269
#define MTL_OFI_LOG_FI_ERR (err , string ) \
289
270
do { \
290
271
opal_output_verbose(1, opal_common_ofi.output, \
@@ -636,12 +617,12 @@ ompi_mtl_ofi_post_recv_excid_buffer(bool blocking, struct ompi_communicator_t *c
636
617
ofi_req -> completion_count = 1 ;
637
618
ofi_req -> comm = comm ;
638
619
639
- MTL_OFI_RETRY_UNTIL_DONE (fi_recv (ompi_mtl_ofi .ofi_ctxt [0 ].rx_ep ,
640
- start ,
641
- length ,
642
- NULL ,
643
- FI_ADDR_UNSPEC ,
644
- (void * )& ofi_req -> ctx ), ret );
620
+ OFI_RETRY_UNTIL_DONE (fi_recv (ompi_mtl_ofi .ofi_ctxt [0 ].rx_ep ,
621
+ start ,
622
+ length ,
623
+ NULL ,
624
+ FI_ADDR_UNSPEC ,
625
+ (void * )& ofi_req -> ctx ), ret );
645
626
if (OPAL_UNLIKELY (0 > ret )) {
646
627
if (NULL != ofi_req -> buffer ) {
647
628
free (ofi_req -> buffer );
@@ -689,14 +670,14 @@ ompi_mtl_ofi_ssend_recv(ompi_mtl_ofi_request_t *ack_req,
689
670
690
671
ofi_req -> completion_count += 1 ;
691
672
692
- MTL_OFI_RETRY_UNTIL_DONE (fi_trecv (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep ,
693
- NULL ,
694
- 0 ,
695
- NULL ,
696
- * src_addr ,
697
- * match_bits | ompi_mtl_ofi .sync_send_ack ,
698
- 0 , /* Exact match, no ignore bits */
699
- (void * ) & ack_req -> ctx ), ret );
673
+ OFI_RETRY_UNTIL_DONE (fi_trecv (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep ,
674
+ NULL ,
675
+ 0 ,
676
+ NULL ,
677
+ * src_addr ,
678
+ * match_bits | ompi_mtl_ofi .sync_send_ack ,
679
+ 0 , /* Exact match, no ignore bits */
680
+ (void * ) & ack_req -> ctx ), ret );
700
681
if (OPAL_UNLIKELY (0 > ret )) {
701
682
opal_output_verbose (1 , opal_common_ofi .output ,
702
683
"%s:%d: fi_trecv failed: %s(%zd)" ,
@@ -788,16 +769,16 @@ ompi_mtl_ofi_send_excid(struct mca_mtl_base_module_t *mtl,
788
769
789
770
if (ompi_mtl_ofi .max_inject_size >= length ) {
790
771
if (ofi_cq_data ) {
791
- MTL_OFI_RETRY_UNTIL_DONE (fi_injectdata (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
792
- start ,
793
- length ,
794
- comm -> c_my_rank ,
795
- sep_peer_fiaddr ), ret );
772
+ OFI_RETRY_UNTIL_DONE (fi_injectdata (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
773
+ start ,
774
+ length ,
775
+ comm -> c_my_rank ,
776
+ sep_peer_fiaddr ), ret );
796
777
} else {
797
- MTL_OFI_RETRY_UNTIL_DONE (fi_inject (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
798
- start ,
799
- length ,
800
- sep_peer_fiaddr ), ret );
778
+ OFI_RETRY_UNTIL_DONE (fi_inject (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
779
+ start ,
780
+ length ,
781
+ sep_peer_fiaddr ), ret );
801
782
}
802
783
if (OPAL_UNLIKELY (0 > ret )) {
803
784
MTL_OFI_LOG_FI_ERR (ret ,
@@ -808,20 +789,20 @@ ompi_mtl_ofi_send_excid(struct mca_mtl_base_module_t *mtl,
808
789
} else {
809
790
ofi_req -> completion_count = 1 ;
810
791
if (ofi_cq_data ) {
811
- MTL_OFI_RETRY_UNTIL_DONE (fi_senddata (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
812
- start ,
813
- length ,
814
- NULL ,
815
- comm -> c_my_rank ,
816
- sep_peer_fiaddr ,
817
- (void * ) & ofi_req -> ctx ), ret );
792
+ OFI_RETRY_UNTIL_DONE (fi_senddata (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
793
+ start ,
794
+ length ,
795
+ NULL ,
796
+ comm -> c_my_rank ,
797
+ sep_peer_fiaddr ,
798
+ (void * ) & ofi_req -> ctx ), ret );
818
799
} else {
819
- MTL_OFI_RETRY_UNTIL_DONE (fi_send (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
820
- start ,
821
- length ,
822
- NULL ,
823
- sep_peer_fiaddr ,
824
- (void * ) & ofi_req -> ctx ), ret );
800
+ OFI_RETRY_UNTIL_DONE (fi_send (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
801
+ start ,
802
+ length ,
803
+ NULL ,
804
+ sep_peer_fiaddr ,
805
+ (void * ) & ofi_req -> ctx ), ret );
825
806
}
826
807
if (OPAL_UNLIKELY (0 > ret )) {
827
808
MTL_OFI_LOG_FI_ERR (ret ,
@@ -952,14 +933,14 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
952
933
if (!(convertor -> flags & CONVERTOR_ACCELERATOR )
953
934
&& (ompi_mtl_ofi .max_inject_size >= length )) {
954
935
if (ofi_cq_data ) {
955
- MTL_OFI_RETRY_UNTIL_DONE (fi_tinjectdata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
956
- start ,
957
- length ,
958
- comm -> c_my_rank ,
959
- sep_peer_fiaddr ,
960
- match_bits ), ret );
936
+ OFI_RETRY_UNTIL_DONE (fi_tinjectdata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
937
+ start ,
938
+ length ,
939
+ comm -> c_my_rank ,
940
+ sep_peer_fiaddr ,
941
+ match_bits ), ret );
961
942
} else {
962
- MTL_OFI_RETRY_UNTIL_DONE (fi_tinject (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
943
+ OFI_RETRY_UNTIL_DONE (fi_tinject (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
963
944
start ,
964
945
length ,
965
946
sep_peer_fiaddr ,
@@ -984,16 +965,16 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
984
965
}
985
966
ofi_req .completion_count += 1 ;
986
967
if (ofi_cq_data ) {
987
- MTL_OFI_RETRY_UNTIL_DONE (fi_tsenddata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
988
- start ,
989
- length ,
990
- (NULL == ofi_req .mr ) ? NULL : ofi_req .mr -> mem_desc ,
991
- comm -> c_my_rank ,
992
- sep_peer_fiaddr ,
993
- match_bits ,
994
- (void * ) & ofi_req .ctx ), ret );
968
+ OFI_RETRY_UNTIL_DONE (fi_tsenddata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
969
+ start ,
970
+ length ,
971
+ (NULL == ofi_req .mr ) ? NULL : ofi_req .mr -> mem_desc ,
972
+ comm -> c_my_rank ,
973
+ sep_peer_fiaddr ,
974
+ match_bits ,
975
+ (void * ) & ofi_req .ctx ), ret );
995
976
} else {
996
- MTL_OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
977
+ OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
997
978
start ,
998
979
length ,
999
980
(NULL == ofi_req .mr ) ? NULL : ofi_req .mr -> mem_desc ,
@@ -1092,8 +1073,8 @@ ompi_mtl_ofi_gen_ssend_ack(struct fi_cq_tagged_entry *wc,
1092
1073
tagged_msg .context = NULL ;
1093
1074
tagged_msg .data = 0 ;
1094
1075
1095
- MTL_OFI_RETRY_UNTIL_DONE (fi_tsendmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
1096
- & tagged_msg , 0 ), ret );
1076
+ OFI_RETRY_UNTIL_DONE (fi_tsendmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
1077
+ & tagged_msg , 0 ), ret );
1097
1078
if (OPAL_UNLIKELY (0 > ret )) {
1098
1079
MTL_OFI_LOG_FI_ERR (ret , "fi_tsendmsg failed during ompi_mtl_ofi_gen_ssend_ack" );
1099
1080
ret = OMPI_ERROR ;
@@ -1238,16 +1219,16 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
1238
1219
1239
1220
1240
1221
if (ofi_cq_data ) {
1241
- MTL_OFI_RETRY_UNTIL_DONE (fi_tsenddata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
1242
- start ,
1243
- length ,
1244
- (NULL == ofi_req -> mr ) ? NULL : ofi_req -> mr -> mem_desc ,
1245
- comm -> c_my_rank ,
1246
- sep_peer_fiaddr ,
1247
- match_bits ,
1248
- (void * ) & ofi_req -> ctx ), ret );
1222
+ OFI_RETRY_UNTIL_DONE (fi_tsenddata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
1223
+ start ,
1224
+ length ,
1225
+ (NULL == ofi_req -> mr ) ? NULL : ofi_req -> mr -> mem_desc ,
1226
+ comm -> c_my_rank ,
1227
+ sep_peer_fiaddr ,
1228
+ match_bits ,
1229
+ (void * ) & ofi_req -> ctx ), ret );
1249
1230
} else {
1250
- MTL_OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
1231
+ OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
1251
1232
start ,
1252
1233
length ,
1253
1234
(NULL == ofi_req -> mr ) ? NULL : ofi_req -> mr -> mem_desc ,
@@ -1456,7 +1437,7 @@ ompi_mtl_ofi_irecv_generic(struct mca_mtl_base_module_t *mtl,
1456
1437
return ompi_ret ;
1457
1438
}
1458
1439
1459
- MTL_OFI_RETRY_UNTIL_DONE (fi_trecv (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep ,
1440
+ OFI_RETRY_UNTIL_DONE (fi_trecv (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep ,
1460
1441
start ,
1461
1442
length ,
1462
1443
(NULL == ofi_req -> mr ) ? NULL : ofi_req -> mr -> mem_desc ,
@@ -1608,7 +1589,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
1608
1589
msg .context = (void * )& ofi_req -> ctx ;
1609
1590
msg .data = 0 ;
1610
1591
1611
- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
1592
+ OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
1612
1593
if (OPAL_UNLIKELY (0 > ret )) {
1613
1594
ompi_mtl_ofi_deregister_and_free_buffer (ofi_req );
1614
1595
MTL_OFI_LOG_FI_ERR (ret , "fi_trecvmsg failed" );
@@ -1740,7 +1721,7 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
1740
1721
ofi_req .completion_count = 1 ;
1741
1722
ofi_req .match_state = 0 ;
1742
1723
1743
- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
1724
+ OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
1744
1725
if (OPAL_UNLIKELY (0 > ret )) {
1745
1726
MTL_OFI_LOG_FI_ERR (ret , "fi_trecvmsg failed" );
1746
1727
return ompi_mtl_ofi_get_error (ret );
@@ -1849,7 +1830,7 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
1849
1830
ofi_req -> match_state = 0 ;
1850
1831
ofi_req -> mask_bits = mask_bits ;
1851
1832
1852
- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
1833
+ OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
1853
1834
if (OPAL_UNLIKELY (0 > ret )) {
1854
1835
MTL_OFI_LOG_FI_ERR (ret , "fi_trecvmsg failed" );
1855
1836
free (ofi_req );
0 commit comments