@@ -416,6 +416,7 @@ struct ompi_comm_ishrink_context_t {
416
416
ompi_group_t * failed_group ;
417
417
ompi_group_t * alive_group ;
418
418
ompi_group_t * alive_rgroup ;
419
+ int flag ;
419
420
double start ;
420
421
};
421
422
typedef struct ompi_comm_ishrink_context_t ompi_comm_ishrink_context_t ;
@@ -429,7 +430,6 @@ static int ompi_comm_ishrink_check_activate(ompi_comm_request_t *request);
429
430
int ompi_comm_ishrink_internal (ompi_communicator_t * comm , ompi_communicator_t * * newcomm , ompi_request_t * * req )
430
431
{
431
432
int rc ;
432
- int flag = 1 ;
433
433
#if OPAL_ENABLE_DEBUG
434
434
double stop ;
435
435
#endif
@@ -479,7 +479,8 @@ int ompi_comm_ishrink_internal(ompi_communicator_t* comm, ompi_communicator_t**
479
479
* the value of flag, instead we are only using the globally consistent
480
480
* return value.
481
481
*/
482
- rc = comm -> c_coll -> coll_iagree ( & flag ,
482
+ context -> flag = 1 ;
483
+ rc = comm -> c_coll -> coll_iagree ( & context -> flag ,
483
484
1 ,
484
485
& ompi_mpi_int .dt ,
485
486
& ompi_mpi_op_band .op ,
@@ -508,7 +509,7 @@ static int ompi_comm_ishrink_check_agree(ompi_comm_request_t *request) {
508
509
ompi_communicator_t * comm = context -> comm ;
509
510
ompi_request_t * subreq [1 ];
510
511
ompi_group_t * comm_group = NULL ;
511
- int rc , flag = 1 ;
512
+ int rc ;
512
513
#if OPAL_ENABLE_DEBUG
513
514
double stop ;
514
515
#endif
@@ -522,13 +523,17 @@ static int ompi_comm_ishrink_check_agree(ompi_comm_request_t *request) {
522
523
rc = request -> super .req_status .MPI_ERROR ;
523
524
if ( (OMPI_SUCCESS != rc ) && (MPI_ERR_PROC_FAILED != rc ) ) {
524
525
opal_output (0 , "%s:%d Agreement failure: %d\n" , __FILE__ , __LINE__ , rc );
526
+ ompi_comm_request_return (request );
527
+ OBJ_RELEASE (context -> failed_group );
525
528
return rc ;
526
529
}
527
530
528
531
if ( MPI_ERR_PROC_FAILED == rc ) {
529
532
/* previous round found more failures, redo */
533
+ OBJ_RELEASE (context -> failed_group );
530
534
request -> super .req_status .MPI_ERROR = MPI_SUCCESS ;
531
- rc = comm -> c_coll -> coll_iagree ( & flag ,
535
+ context -> flag = 1 ;
536
+ rc = comm -> c_coll -> coll_iagree ( & context -> flag ,
532
537
1 ,
533
538
& ompi_mpi_int .dt ,
534
539
& ompi_mpi_op_band .op ,
@@ -575,7 +580,6 @@ static int ompi_comm_ishrink_check_agree(ompi_comm_request_t *request) {
575
580
}
576
581
}
577
582
OBJ_RELEASE (context -> failed_group );
578
- context -> failed_group = NULL ;
579
583
580
584
rc = ompi_comm_set_nb ( context -> newcomm , /* new comm */
581
585
comm , /* old comm */
@@ -614,15 +618,16 @@ static int ompi_comm_ishrink_check_setrank(ompi_comm_request_t *request) {
614
618
615
619
/* cleanup temporary groups */
616
620
OBJ_RELEASE (context -> alive_group );
617
- context -> alive_group = NULL ;
618
621
if ( NULL != context -> alive_rgroup ) {
619
622
OBJ_RELEASE (context -> alive_rgroup );
620
623
}
621
- context -> alive_rgroup = NULL ;
622
624
623
625
/* check errors in prior step */
624
- if ( NULL == * context -> newcomm ) {
625
- rc = MPI_ERR_INTERN ;
626
+ rc = request -> super .req_status .MPI_ERROR ;
627
+ if ( OMPI_SUCCESS != rc ) {
628
+ opal_output_verbose (1 , ompi_ftmpi_output_handle ,
629
+ "%s ompi: comm_ishrink: Construction failed with error %d" ,
630
+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ), rc );
626
631
ompi_comm_request_return (request );
627
632
OBJ_RELEASE (* context -> newcomm );
628
633
return rc ;
@@ -719,6 +724,7 @@ static int ompi_comm_ishrink_check_cid(ompi_comm_request_t *request) {
719
724
mode ,
720
725
subreq );
721
726
if ( OMPI_SUCCESS != rc ) {
727
+ ompi_comm_request_return (request );
722
728
OBJ_RELEASE (* context -> newcomm );
723
729
return rc ;
724
730
}
@@ -729,18 +735,23 @@ static int ompi_comm_ishrink_check_cid(ompi_comm_request_t *request) {
729
735
}
730
736
731
737
static int ompi_comm_ishrink_check_activate (ompi_comm_request_t * request ) {
738
+ ompi_comm_ishrink_context_t * context =
739
+ (ompi_comm_ishrink_context_t * )request -> context ;
732
740
int rc ;
733
741
#if OPAL_ENABLE_DEBUG
734
742
double stop ;
735
743
#endif
736
744
737
745
rc = request -> super .req_status .MPI_ERROR ;
738
746
if ( OMPI_SUCCESS != rc ) {
747
+ opal_output_verbose (1 , ompi_ftmpi_output_handle ,
748
+ "%s ompi: comm_ishrink: Activation failed with error %d" ,
749
+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ), rc );
750
+ ompi_comm_request_return (request );
751
+ OBJ_RELEASE (* context -> newcomm );
739
752
return rc ;
740
753
}
741
754
#if OPAL_ENABLE_DEBUG
742
- ompi_comm_ishrink_context_t * context =
743
- (ompi_comm_ishrink_context_t * )request -> context ;
744
755
stop = MPI_Wtime ();
745
756
OPAL_OUTPUT_VERBOSE ((10 , ompi_ftmpi_output_handle ,
746
757
"%s ompi: comm_ishrink: COLL SELECT: %g seconds\n" ,
0 commit comments