@@ -113,6 +113,7 @@ int used_comm = -1, n_failed_imgs=0, error_called=0;
113
113
int * ranks_gc ,* ranks_gf ; //to be returned by failed images
114
114
MPI_Errhandler errh ,errh_w ;
115
115
int completed = 0 ,tmp_lock ;
116
+ int * stopped_images ;
116
117
117
118
static int cmpfunc (const void * a , const void * b )
118
119
{
@@ -501,7 +502,8 @@ PREFIX (init) (int *argc, char ***argv)
501
502
502
503
ranks_gf = (int * )malloc (caf_num_images * sizeof (int ));
503
504
ranks_gc = (int * )malloc (caf_num_images * sizeof (int ));
504
-
505
+ stopped_images = (int * )calloc (caf_num_images , sizeof (int ));
506
+
505
507
#if MPI_VERSION >= 3
506
508
MPI_Info_create (& mpi_info_same_size );
507
509
MPI_Info_set (mpi_info_same_size , "same_size" , "true" );
@@ -533,7 +535,10 @@ PREFIX (finalize) (void)
533
535
* img_status = STAT_STOPPED_IMAGE ; /* GFC_STAT_STOPPED_IMAGE = 6000 */
534
536
MPI_Win_sync (* stat_tok );
535
537
536
- completed = 1 ;
538
+ MPIX_Comm_revoke (CAF_COMM_WORLD );
539
+ communicator_shrink (& CAF_COMM_WORLD );
540
+
541
+ MPI_Barrier (stopped_comm );
537
542
538
543
while (caf_static_list != NULL )
539
544
{
@@ -561,24 +566,16 @@ PREFIX (finalize) (void)
561
566
MPI_Info_free (& mpi_info_same_size );
562
567
#endif // MPI_VERSION
563
568
564
- //MPI_Comm_free(&CAF_COMM_WORLD);
565
-
566
- printf ("Before revoke\n" );
569
+ /* MPI_Comm_free(&CAF_COMM_WORLD); */
567
570
568
- MPIX_Comm_revoke (CAF_COMM_WORLD );
569
- printf ("After revoke\n" );
570
- MPI_Test (& stopped_req ,& flag ,MPI_STATUS_IGNORE );
571
- communicator_shrink (& stopped_comm );
572
- MPI_Barrier (stopped_comm );
573
- printf ("After barrier\n" );
574
571
/* Only call Finalize if CAF runtime Initialized MPI. */
575
572
if (caf_owns_mpi ) {
576
573
MPI_Finalize ();
577
574
}
578
575
pthread_mutex_lock (& lock_am );
579
576
caf_is_finalized = 1 ;
580
577
pthread_mutex_unlock (& lock_am );
581
- printf ("finalizing\n" );
578
+ /* printf("finalizing\n"); */
582
579
exit (0 );
583
580
}
584
581
@@ -614,6 +611,8 @@ int communicator_shrink(MPI_Comm *comm)
614
611
615
612
/* Split does the magic: removing spare processes and reordering ranks
616
613
* so that all surviving processes remain at their former place */
614
+ if (* img_status == STAT_STOPPED_IMAGE )
615
+ crank = -1 ;
617
616
rc = MPI_Comm_split (shrunk , crank < 0 ?MPI_UNDEFINED :1 , crank , newcomm );
618
617
619
618
/* Split or some of the communications above may have failed if
@@ -708,10 +707,11 @@ void *
708
707
MPI_Win_flush (caf_this_image - 1 , * p );
709
708
# endif // CAF_MPI_LOCK_UNLOCK
710
709
free (init_array );
711
- MPI_Barrier (CAF_COMM_WORLD );
712
710
/* PREFIX(sync_all) (NULL,NULL,0); */
713
711
}
714
712
713
+ MPI_Barrier (CAF_COMM_WORLD );
714
+
715
715
if (error_called == 1 )
716
716
{
717
717
communicator_shrink (& CAF_COMM_WORLD );
0 commit comments