Skip to content

Commit c945769

Browse files
authored
Merge pull request #5557 from karasevb/v3.0.x_pmix_fence_status
v3.0.x/pmix: added check for pmix fence status
2 parents 7fdca18 + 20ad9ad commit c945769

File tree

10 files changed: 136 additions and 34 deletions

ompi/dpm/dpm.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,12 @@ int ompi_dpm_disconnect(ompi_communicator_t *comm)
596596
} else {
597597
ret = opal_pmix.disconnect(&coll);
598598
}
599+
if (OMPI_SUCCESS != ret) {
600+
OMPI_ERROR_LOG(ret);
601+
OPAL_LIST_DESTRUCT(&coll);
602+
return ret;
603+
}
604+
599605
OPAL_LIST_DESTRUCT(&coll);
600606

601607
return ret;

ompi/mca/bml/r2/bml_r2_ft.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,10 @@ int mca_bml_r2_ft_event(int state)
155155
* Barrier to make sure all processes have been successfully restarted before
156156
* we try to remove some restart-only files.
157157
*/
158-
opal_pmix.fence(NULL, 0);
158+
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
159+
opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n");
160+
return ret;
161+
}
159162

160163
/*
161164
* Re-open the BTL framework to get the full list of components.
@@ -224,7 +227,10 @@ int mca_bml_r2_ft_event(int state)
224227
* Barrier to make sure all processes have been successfully restarted before
225228
* we try to remove some restart-only files.
226229
*/
227-
opal_pmix.fence(NULL, 0);
230+
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
231+
opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n");
232+
return ret;
233+
}
228234

229235
/*
230236
* Re-open the BTL framework to get the full list of components.

ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3028,7 +3028,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
30283028

30293029
if( opal_cr_timing_barrier_enabled ) {
30303030
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR0);
3031-
opal_pmix.fence(NULL, 0);
3031+
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
3032+
exit_status = ret;
3033+
goto DONE;
3034+
}
30323035
}
30333036
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP0);
30343037

@@ -3096,7 +3099,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
30963099

30973100
if( opal_cr_timing_barrier_enabled ) {
30983101
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR1);
3099-
opal_pmix.fence(NULL, 0);
3102+
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
3103+
exit_status = ret;
3104+
goto DONE;
3105+
}
31003106
}
31013107
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE2);
31023108
}
@@ -6207,14 +6213,16 @@ static void clear_timers(void) {
62076213
static void display_all_timers(int state) {
62086214
bool report_ready = false;
62096215
double barrier_start, barrier_stop;
6210-
int i;
6216+
int i, ret;
62116217

62126218
if( 0 != OMPI_PROC_MY_NAME->vpid ) {
62136219
if( 2 > timing_enabled ) {
62146220
return;
62156221
}
62166222
else if( 2 == timing_enabled ) {
6217-
opal_pmix.fence(NULL, 0);
6223+
if( OPAL_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
6224+
OPAL_ERROR_LOG(ret);
6225+
}
62186226
return;
62196227
}
62206228
}
@@ -6235,7 +6243,9 @@ static void display_all_timers(int state) {
62356243

62366244
if( timing_enabled >= 2) {
62376245
barrier_start = get_time();
6238-
opal_pmix.fence(NULL, 0);
6246+
if( OPAL_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
6247+
OPAL_ERROR_LOG(ret);
6248+
}
62396249
barrier_stop = get_time();
62406250
opal_output(0,
62416251
"crcp:bkmrk: timing(%20s): %20s = %10.2f s\n",

ompi/mca/pml/bfo/pml_bfo.c

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -666,7 +666,10 @@ int mca_pml_bfo_ft_event( int state )
666666
if(OPAL_CRS_CHECKPOINT == state) {
667667
if( opal_cr_timing_barrier_enabled ) {
668668
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
669-
opal_pmix.fence(NULL, 0);
669+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
670+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
671+
return ret;
672+
}
670673
}
671674

672675
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
@@ -677,7 +680,10 @@ int mca_pml_bfo_ft_event( int state )
677680
if( !first_continue_pass ) {
678681
if( opal_cr_timing_barrier_enabled ) {
679682
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
680-
opal_pmix.fence(NULL, 0);
683+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
684+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
685+
return ret;
686+
}
681687
}
682688
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
683689
}
@@ -777,7 +783,10 @@ int mca_pml_bfo_ft_event( int state )
777783
if( !first_continue_pass ) {
778784
if( opal_cr_timing_barrier_enabled ) {
779785
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
780-
opal_pmix.fence(NULL, 0);
786+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
787+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
788+
return ret;
789+
}
781790
}
782791
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
783792
}
@@ -787,7 +796,10 @@ int mca_pml_bfo_ft_event( int state )
787796
* Exchange the modex information once again.
788797
* BTLs will have republished their modex information.
789798
*/
790-
opal_pmix.fence(NULL, 0);
799+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
800+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
801+
return ret;
802+
}
791803

792804
/*
793805
* Startup the PML stack now that the modex is running again
@@ -799,7 +811,10 @@ int mca_pml_bfo_ft_event( int state )
799811
}
800812

801813
/* Is this barrier necessary ? JJH */
802-
opal_pmix.fence(NULL, 0);
814+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
815+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
816+
return ret;
817+
}
803818

804819
if( NULL != procs ) {
805820
for(p = 0; p < (int)num_procs; ++p) {
@@ -812,7 +827,10 @@ int mca_pml_bfo_ft_event( int state )
812827
if( !first_continue_pass ) {
813828
if( opal_cr_timing_barrier_enabled ) {
814829
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
815-
opal_pmix.fence(NULL, 0);
830+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
831+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
832+
return ret;
833+
}
816834
}
817835
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
818836
}
@@ -825,7 +843,10 @@ int mca_pml_bfo_ft_event( int state )
825843
* Exchange the modex information once again.
826844
* BTLs will have republished their modex information.
827845
*/
828-
opal_pmix.fence(NULL, 0);
846+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
847+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
848+
return ret;
849+
}
829850

830851
/*
831852
* Startup the PML stack now that the modex is running again
@@ -837,7 +858,10 @@ int mca_pml_bfo_ft_event( int state )
837858
}
838859

839860
/* Is this barrier necessary ? JJH */
840-
opal_pmix.fence(NULL, 0);
861+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
862+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
863+
return ret;
864+
}
841865

842866
if( NULL != procs ) {
843867
for(p = 0; p < (int)num_procs; ++p) {

ompi/mca/pml/ob1/pml_ob1.c

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -797,7 +797,10 @@ int mca_pml_ob1_ft_event( int state )
797797
if(OPAL_CRS_CHECKPOINT == state) {
798798
if( opal_cr_timing_barrier_enabled ) {
799799
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
800-
opal_pmix.fence(NULL, 0);
800+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
801+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
802+
return ret;
803+
}
801804
}
802805

803806
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
@@ -808,7 +811,10 @@ int mca_pml_ob1_ft_event( int state )
808811
if( !first_continue_pass ) {
809812
if( opal_cr_timing_barrier_enabled ) {
810813
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
811-
opal_pmix.fence(NULL, 0);
814+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
815+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
816+
return ret;
817+
}
812818
}
813819
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
814820
}
@@ -908,13 +914,19 @@ int mca_pml_ob1_ft_event( int state )
908914
if( !first_continue_pass ) {
909915
if( opal_cr_timing_barrier_enabled ) {
910916
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
911-
opal_pmix.fence(NULL, 0);
917+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
918+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
919+
return ret;
920+
}
912921
}
913922
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
914923
}
915924

916925
if (opal_cr_continue_like_restart && !first_continue_pass) {
917-
opal_pmix.fence(NULL, 0);
926+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
927+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
928+
return ret;
929+
}
918930

919931
/*
920932
* Startup the PML stack now that the modex is running again
@@ -926,7 +938,10 @@ int mca_pml_ob1_ft_event( int state )
926938
}
927939

928940
/* Is this barrier necessary ? JJH */
929-
opal_pmix.fence(NULL, 0);
941+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
942+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
943+
return ret;
944+
}
930945

931946
if( NULL != procs ) {
932947
for(p = 0; p < (int)num_procs; ++p) {
@@ -939,7 +954,10 @@ int mca_pml_ob1_ft_event( int state )
939954
if( !first_continue_pass ) {
940955
if( opal_cr_timing_barrier_enabled ) {
941956
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
942-
opal_pmix.fence(NULL, 0);
957+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
958+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
959+
return ret;
960+
}
943961
}
944962
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
945963
}
@@ -952,7 +970,10 @@ int mca_pml_ob1_ft_event( int state )
952970
* Exchange the modex information once again.
953971
* BTLs will have republished their modex information.
954972
*/
955-
opal_pmix.fence(NULL, 0);
973+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
974+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
975+
return ret;
976+
}
956977

957978
/*
958979
* Startup the PML stack now that the modex is running again
@@ -964,7 +985,10 @@ int mca_pml_ob1_ft_event( int state )
964985
}
965986

966987
/* Is this barrier necessary ? JJH */
967-
opal_pmix.fence(NULL, 0);
988+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
989+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
990+
return ret;
991+
}
968992

969993
if( NULL != procs ) {
970994
for(p = 0; p < (int)num_procs; ++p) {

ompi/mca/pml/yalla/pml_yalla.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ int mca_pml_yalla_add_procs(struct ompi_proc_t **procs, size_t nprocs)
265265
int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
266266
{
267267
size_t i;
268+
int ret;
268269

269270
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
270271
PML_YALLA_VERBOSE(3, "%s", "using bulk powerdown");
@@ -276,7 +277,9 @@ int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
276277
PML_YALLA_VERBOSE(2, "disconnected from rank %s", OPAL_NAME_PRINT(procs[i]->super.proc_name));
277278
procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = NULL;
278279
}
279-
opal_pmix.fence(NULL, 0);
280+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
281+
return ret;
282+
}
280283
return OMPI_SUCCESS;
281284
}
282285

ompi/runtime/ompi_mpi_finalize.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,13 @@ int ompi_mpi_finalize(void)
259259
* communications/actions to complete. See
260260
* https://github.com/open-mpi/ompi/issues/1576 for the
261261
* original bug report. */
262-
opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active);
262+
if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, 0, fence_cbfunc,
263+
(void*)&active))) {
264+
OMPI_ERROR_LOG(ret);
265+
/* Reset the active flag to false to avoid waiting for
266+
* completion when the fence has failed. */
267+
active = false;
268+
}
263269
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
264270
} else {
265271
/* However, we cannot guarantee that the provided PMIx has
@@ -270,7 +276,9 @@ int ompi_mpi_finalize(void)
270276
ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
271277
comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module);
272278

273-
opal_pmix.fence(NULL, 0);
279+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
280+
OMPI_ERROR_LOG(ret);
281+
}
274282
}
275283
}
276284

ompi/runtime/ompi_mpi_init.c

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -661,8 +661,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
661661
if (!opal_pmix_base_async_modex) {
662662
if (NULL != opal_pmix.fence_nb) {
663663
active = true;
664-
opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
665-
fence_release, (void*)&active);
664+
if( OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL,
665+
opal_pmix_collect_all_data,
666+
fence_release, (void*)&active))) {
667+
error = "opal_pmix.fence_nb() failed";
668+
goto error;
669+
}
666670
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
667671
} else {
668672
opal_pmix.fence(NULL, opal_pmix_collect_all_data);
@@ -838,11 +842,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
838842
if (!ompi_async_mpi_init) {
839843
active = true;
840844
if (NULL != opal_pmix.fence_nb) {
841-
opal_pmix.fence_nb(NULL, false,
842-
fence_release, (void*)&active);
845+
if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, false,
846+
fence_release, (void*)&active))) {
847+
error = "opal_pmix.fence_nb() failed";
848+
goto error;
849+
}
843850
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
844851
} else {
845-
opal_pmix.fence(NULL, false);
852+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, false))) {
853+
error = "opal_pmix.fence() failed";
854+
goto error;
855+
}
846856
}
847857
}
848858

orte/mca/ess/pmi/ess_pmi_module.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,10 @@ static int rte_init(void)
439439
if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
440440
/* need to commit the data before we fence */
441441
opal_pmix.commit();
442-
opal_pmix.fence(NULL, 0);
442+
if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
443+
error = "opal_pmix.fence() failed";
444+
goto error;
445+
}
443446
}
444447

445448
return ORTE_SUCCESS;

orte/mca/snapc/full/snapc_full_app.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,11 @@ int app_coord_init()
150150
"app) Startup Barrier..."));
151151
}
152152

153-
opal_pmix.fence(NULL, 0);
153+
if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
154+
ORTE_ERROR_LOG(ret);
155+
exit_status = ret;
156+
goto cleanup;
157+
}
154158

155159
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
156160
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
@@ -216,7 +220,11 @@ int app_coord_finalize()
216220
"app) Shutdown Barrier..."));
217221
}
218222

219-
opal_pmix.fence(NULL, 0);
223+
if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
224+
ORTE_ERROR_LOG(ret);
225+
exit_status = ret;
226+
goto cleanup;
227+
}
220228

221229
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
222230
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,

0 commit comments

Comments
 (0)