Skip to content

Commit b01e156

Browse files
committed
ulfm: Consistency error check fixes in ishrink implementation
Signed-off-by: Lisandro Dalcin <[email protected]>
1 parent fdc618c commit b01e156

File tree

1 file changed

+22
-11
lines changed

1 file changed

+22
-11
lines changed

ompi/communicator/ft/comm_ft.c

Lines changed: 22 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -416,6 +416,7 @@ struct ompi_comm_ishrink_context_t {
416416
ompi_group_t *failed_group;
417417
ompi_group_t *alive_group;
418418
ompi_group_t *alive_rgroup;
419+
int flag;
419420
double start;
420421
};
421422
typedef struct ompi_comm_ishrink_context_t ompi_comm_ishrink_context_t;
@@ -429,7 +430,6 @@ static int ompi_comm_ishrink_check_activate(ompi_comm_request_t *request);
429430
int ompi_comm_ishrink_internal(ompi_communicator_t* comm, ompi_communicator_t** newcomm, ompi_request_t** req)
430431
{
431432
int rc;
432-
int flag = 1;
433433
#if OPAL_ENABLE_DEBUG
434434
double stop;
435435
#endif
@@ -479,7 +479,8 @@ int ompi_comm_ishrink_internal(ompi_communicator_t* comm, ompi_communicator_t**
479479
* the value of flag, instead we are only using the globally consistent
480480
* return value.
481481
*/
482-
rc = comm->c_coll->coll_iagree( &flag,
482+
context->flag = 1;
483+
rc = comm->c_coll->coll_iagree( &context->flag,
483484
1,
484485
&ompi_mpi_int.dt,
485486
&ompi_mpi_op_band.op,
@@ -508,7 +509,7 @@ static int ompi_comm_ishrink_check_agree(ompi_comm_request_t *request) {
508509
ompi_communicator_t *comm = context->comm;
509510
ompi_request_t *subreq[1];
510511
ompi_group_t *comm_group = NULL;
511-
int rc, flag = 1;
512+
int rc;
512513
#if OPAL_ENABLE_DEBUG
513514
double stop;
514515
#endif
@@ -522,13 +523,17 @@ static int ompi_comm_ishrink_check_agree(ompi_comm_request_t *request) {
522523
rc = request->super.req_status.MPI_ERROR;
523524
if( (OMPI_SUCCESS != rc) && (MPI_ERR_PROC_FAILED != rc) ) {
524525
opal_output(0, "%s:%d Agreement failure: %d\n", __FILE__, __LINE__, rc);
526+
ompi_comm_request_return(request);
527+
OBJ_RELEASE(context->failed_group);
525528
return rc;
526529
}
527530

528531
if( MPI_ERR_PROC_FAILED == rc ) {
529532
/* previous round found more failures, redo */
533+
OBJ_RELEASE(context->failed_group);
530534
request->super.req_status.MPI_ERROR = MPI_SUCCESS;
531-
rc = comm->c_coll->coll_iagree( &flag,
535+
context->flag = 1;
536+
rc = comm->c_coll->coll_iagree( &context->flag,
532537
1,
533538
&ompi_mpi_int.dt,
534539
&ompi_mpi_op_band.op,
@@ -575,7 +580,6 @@ static int ompi_comm_ishrink_check_agree(ompi_comm_request_t *request) {
575580
}
576581
}
577582
OBJ_RELEASE(context->failed_group);
578-
context->failed_group = NULL;
579583

580584
rc = ompi_comm_set_nb( context->newcomm, /* new comm */
581585
comm, /* old comm */
@@ -614,15 +618,16 @@ static int ompi_comm_ishrink_check_setrank(ompi_comm_request_t *request) {
614618

615619
/* cleanup temporary groups */
616620
OBJ_RELEASE(context->alive_group);
617-
context->alive_group = NULL;
618621
if( NULL != context->alive_rgroup ) {
619622
OBJ_RELEASE(context->alive_rgroup);
620623
}
621-
context->alive_rgroup = NULL;
622624

623625
/* check errors in prior step */
624-
if( NULL == *context->newcomm ) {
625-
rc = MPI_ERR_INTERN;
626+
rc = request->super.req_status.MPI_ERROR;
627+
if( OMPI_SUCCESS != rc ) {
628+
opal_output_verbose(1, ompi_ftmpi_output_handle,
629+
"%s ompi: comm_ishrink: Construction failed with error %d",
630+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), rc);
626631
ompi_comm_request_return(request);
627632
OBJ_RELEASE(*context->newcomm);
628633
return rc;
@@ -719,6 +724,7 @@ static int ompi_comm_ishrink_check_cid(ompi_comm_request_t *request) {
719724
mode,
720725
subreq );
721726
if( OMPI_SUCCESS != rc ) {
727+
ompi_comm_request_return(request);
722728
OBJ_RELEASE(*context->newcomm);
723729
return rc;
724730
}
@@ -729,18 +735,23 @@ static int ompi_comm_ishrink_check_cid(ompi_comm_request_t *request) {
729735
}
730736

731737
static int ompi_comm_ishrink_check_activate(ompi_comm_request_t *request) {
738+
ompi_comm_ishrink_context_t *context =
739+
(ompi_comm_ishrink_context_t *)request->context;
732740
int rc;
733741
#if OPAL_ENABLE_DEBUG
734742
double stop;
735743
#endif
736744

737745
rc = request->super.req_status.MPI_ERROR;
738746
if( OMPI_SUCCESS != rc ) {
747+
opal_output_verbose(1, ompi_ftmpi_output_handle,
748+
"%s ompi: comm_ishrink: Activation failed with error %d",
749+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), rc);
750+
ompi_comm_request_return(request);
751+
OBJ_RELEASE(*context->newcomm);
739752
return rc;
740753
}
741754
#if OPAL_ENABLE_DEBUG
742-
ompi_comm_ishrink_context_t *context =
743-
(ompi_comm_ishrink_context_t *)request->context;
744755
stop = MPI_Wtime();
745756
OPAL_OUTPUT_VERBOSE((10, ompi_ftmpi_output_handle,
746757
"%s ompi: comm_ishrink: COLL SELECT: %g seconds\n",

0 commit comments

Comments
 (0)