@@ -75,8 +75,8 @@ static int caf_is_finalized = 0;
75
75
/*Sync image part*/
76
76
77
77
static int * orders ;
78
- MPI_Request * handlers ;
79
78
static int * images_full ;
79
+ MPI_Request * sync_handles ;
80
80
static int * arrived ;
81
81
82
82
/* Pending puts */
@@ -96,8 +96,8 @@ caf_static_t *caf_tot = NULL;
96
96
97
97
/* Image status variable */
98
98
99
- static int * img_status = NULL ;
100
- MPI_Win * stat_tok ;
99
+ static int img_status = 0 ;
100
+ static MPI_Win * stat_tok ;
101
101
102
102
/* Active messages variables */
103
103
@@ -400,27 +400,31 @@ PREFIX (init) (int *argc, char ***argv)
400
400
orders = calloc (caf_num_images , sizeof (int ));
401
401
arrived = calloc (caf_num_images , sizeof (int ));
402
402
403
- handlers = malloc (caf_num_images * sizeof (MPI_Request ));
403
+ sync_handles = malloc (caf_num_images * sizeof (MPI_Request ));
404
404
405
405
stat_tok = malloc (sizeof (MPI_Win ));
406
406
407
407
#if MPI_VERSION >= 3
408
408
MPI_Info_create (& mpi_info_same_size );
409
409
MPI_Info_set (mpi_info_same_size , "same_size" , "true" );
410
410
/* Setting img_status */
411
- MPI_Win_allocate ( sizeof (int ), 1 , mpi_info_same_size , CAF_COMM_WORLD , & img_status , stat_tok );
411
+ MPI_Win_create ( & img_status , sizeof (int ), 1 , mpi_info_same_size , CAF_COMM_WORLD , stat_tok );
412
412
# ifndef CAF_MPI_LOCK_UNLOCK
413
413
MPI_Win_lock_all (MPI_MODE_NOCHECK , * stat_tok );
414
414
# endif // CAF_MPI_LOCK_UNLOCK
415
415
#else
416
416
MPI_Alloc_mem (sizeof (int ), MPI_INFO_NULL , & img_status , stat_tok );
417
417
MPI_Win_create (img_status , sizeof (int ), 1 , MPI_INFO_NULL , CAF_COMM_WORLD , stat_tok );
418
418
#endif // MPI_VERSION
419
- * img_status = 0 ;
420
419
}
421
420
/* MPI_Barrier(CAF_COMM_WORLD); */
422
421
}
423
422
423
+ /* Forward declaration of sync_images. */
424
+
425
+ void
426
+ sync_images_internal (int count , int images [], int * stat , char * errmsg ,
427
+ int errmsg_len , bool internal );
424
428
425
429
/* Finalize coarray program. */
426
430
@@ -431,10 +435,17 @@ _gfortran_caf_finalize (void)
431
435
PREFIX (finalize ) (void )
432
436
#endif
433
437
{
434
- * img_status = STAT_STOPPED_IMAGE ; /* GFC_STAT_STOPPED_IMAGE = 6000 */
435
- MPI_Win_sync (* stat_tok );
438
+ /* For future security enclose setting img_status in a lock. */
439
+ CAF_Win_lock (MPI_LOCK_EXCLUSIVE , caf_this_image - 1 , * stat_tok );
440
+ img_status = STAT_STOPPED_IMAGE ; /* GFC_STAT_STOPPED_IMAGE = 6000 */
441
+ CAF_Win_unlock (caf_this_image - 1 , * stat_tok );
436
442
437
- MPI_Barrier (CAF_COMM_WORLD );
443
+ /* Announce to all other images, that this one is stopped. */
444
+ for (int i = 0 ; i < caf_num_images - 1 ; ++ i )
445
+ MPI_Send (& img_status , 1 , MPI_INT , images_full [i ] - 1 , 0 , CAF_COMM_WORLD );
446
+
447
+ /* Add a conventional barrier to prevent images from quitting to early. */
448
+ MPI_Barrier (CAF_COMM_WORLD );
438
449
439
450
while (caf_static_list != NULL )
440
451
{
@@ -477,6 +488,10 @@ PREFIX (finalize) (void)
477
488
MPI_Info_free (& mpi_info_same_size );
478
489
#endif // MPI_VERSION
479
490
491
+ # ifndef CAF_MPI_LOCK_UNLOCK
492
+ MPI_Win_unlock_all (* stat_tok );
493
+ # endif // CAF_MPI_LOCK_UNLOCK
494
+ MPI_Win_free (stat_tok );
480
495
MPI_Comm_free (& CAF_COMM_WORLD );
481
496
482
497
/* Only call Finalize if CAF runtime Initialized MPI. */
@@ -486,7 +501,7 @@ PREFIX (finalize) (void)
486
501
pthread_mutex_lock (& lock_am );
487
502
caf_is_finalized = 1 ;
488
503
pthread_mutex_unlock (& lock_am );
489
- exit ( 0 );
504
+ free ( sync_handles );
490
505
}
491
506
492
507
@@ -2988,7 +3003,14 @@ void
2988
3003
PREFIX (sync_images ) (int count , int images [], int * stat , char * errmsg ,
2989
3004
int errmsg_len )
2990
3005
{
2991
- int ierr = 0 , i = 0 , remote_stat = 0 , j = 0 ;
3006
+ sync_images_internal (count , images , stat , errmsg , errmsg_len , false);
3007
+ }
3008
+
3009
+ void
3010
+ sync_images_internal (int count , int images [], int * stat , char * errmsg ,
3011
+ int errmsg_len , bool internal )
3012
+ {
3013
+ int ierr = 0 , i = 0 , j = 0 , int_zero = 0 , done_count = 0 ;
2992
3014
MPI_Status s ;
2993
3015
2994
3016
if (count == 0 || (count == 1 && images [0 ] == caf_this_image ))
@@ -3042,59 +3064,54 @@ PREFIX (sync_images) (int count, int images[], int *stat, char *errmsg,
3042
3064
explicit_flush ();
3043
3065
#endif
3044
3066
3067
+ /* A rather simple way to synchronice:
3068
+ - expect all images to sync with receiving an int,
3069
+ - on the other side, send all processes to sync with an int,
3070
+ - when the int received is STAT_STOPPED_IMAGE the return immediately,
3071
+ else wait until all images in the current set of images have send
3072
+ some data, i.e., synced.
3073
+
3074
+ This approach as best as possible implements the syncing of different
3075
+ sets of images and figuring that an image has stopped. MPI does not
3076
+ provide any direct means of syncing non-coherent sets of images.
3077
+ The groups/communicators of MPI always need to be consistent, i.e.,
3078
+ have the same members on all images participating. This is
3079
+ contradictiory to the sync images statement, where syncing, e.g., in a
3080
+ ring pattern is possible.
3081
+
3082
+ This implementation guarantees, that as long as no image is stopped
3083
+ an image only is allowed to continue, when all its images to sync to
3084
+ also have reached a sync images statement. This implementation makes
3085
+ no assumption when the image continues or in which order synced
3086
+ images continue. */
3045
3087
for (i = 0 ; i < count ; ++ i )
3046
3088
/* Need to have the request handlers contigously in the handlers
3047
3089
array or waitany below will trip about the handler as illegal. */
3048
3090
ierr = MPI_Irecv (& arrived [images [i ] - 1 ], 1 , MPI_INT , images [i ] - 1 , 0 ,
3049
- CAF_COMM_WORLD , & handlers [i ]);
3091
+ CAF_COMM_WORLD , & sync_handles [i ]);
3050
3092
for (i = 0 ; i < count ; ++ i )
3093
+ MPI_Send (& int_zero , 1 , MPI_INT , images [i ] - 1 , 0 , CAF_COMM_WORLD );
3094
+ done_count = 0 ;
3095
+ while (done_count < count )
3051
3096
{
3052
- # ifdef CAF_MPI_LOCK_UNLOCK
3053
- MPI_Win_lock (MPI_LOCK_SHARED , images [i ] - 1 , 0 , * stat_tok );
3054
- # endif // CAF_MPI_LOCK_UNLOCK
3055
- ierr = MPI_Get (& remote_stat , 1 , MPI_INT ,
3056
- images [i ] - 1 , 0 , 1 , MPI_INT , * stat_tok );
3057
- # ifdef CAF_MPI_LOCK_UNLOCK
3058
- MPI_Win_unlock (images [i ] - 1 , * stat_tok );
3059
- # else // CAF_MPI_LOCK_UNLOCK
3060
- MPI_Win_flush (images [i ] - 1 , * stat_tok );
3061
- # endif // CAF_MPI_LOCK_UNLOCK
3062
- if (remote_stat != 0 )
3097
+ ierr = MPI_Waitany (count , sync_handles , & i , & s );
3098
+ if (i != MPI_UNDEFINED )
3063
3099
{
3064
- ierr = STAT_STOPPED_IMAGE ;
3065
- /* Let the other images know, that at least one image is
3066
- stopped by sending STAT_STOPPED_IMAGE instead of our id. */
3067
- for (i = 0 ; i < count ; ++ i )
3068
- MPI_Send (& ierr , 1 , MPI_INT , images [i ] - 1 , 0 , CAF_COMM_WORLD );
3069
- break ;
3070
- }
3071
- }
3072
- if (ierr == 0 )
3073
- {
3074
- int done_count = 0 ;
3075
- for (i = 0 ; i < count ; ++ i )
3076
- {
3077
- if (arrived [images [i ] - 1 ] != STAT_STOPPED_IMAGE )
3078
- /* Only send, when no stopped images have been found. */
3079
- ierr = MPI_Send (& caf_this_image , 1 , MPI_INT , images [i ] - 1 , 0 ,
3080
- CAF_COMM_WORLD );
3081
- else
3082
- ierr = STAT_STOPPED_IMAGE ;
3083
- }
3084
-
3085
- while (ierr != STAT_STOPPED_IMAGE && done_count < count )
3086
- {
3087
- ierr = MPI_Waitany (count , handlers , & i , & s );
3088
- if (i != MPI_UNDEFINED )
3089
- ++ done_count ;
3090
- if (i != MPI_UNDEFINED && arrived [i ] == STAT_STOPPED_IMAGE )
3091
- ierr = STAT_STOPPED_IMAGE ;
3092
- else if (ierr != MPI_SUCCESS )
3093
- break ;
3100
+ ++ done_count ;
3101
+ if (ierr == MPI_SUCCESS && arrived [i ] == STAT_STOPPED_IMAGE )
3102
+ {
3103
+ /* Possible future extension: Abort pending receives. At the
3104
+ moment the receives are discarded by the program
3105
+ termination. For the tested mpi-implementation this is ok.
3106
+ */
3107
+ ierr = STAT_STOPPED_IMAGE ;
3108
+ break ;
3109
+ }
3094
3110
}
3111
+ else if (ierr != MPI_SUCCESS )
3112
+ /* Abort receives here, too, when implemented above. */
3113
+ break ;
3095
3114
}
3096
-
3097
- memset (arrived , 0 , sizeof (int ) * caf_num_images );
3098
3115
}
3099
3116
3100
3117
sync_images_err_chk :
@@ -3117,8 +3134,8 @@ PREFIX (sync_images) (int count, int images[], int *stat, char *errmsg,
3117
3134
if (errmsg_len > len )
3118
3135
memset (& errmsg [len ], ' ' , errmsg_len - len );
3119
3136
}
3120
- else
3121
- caf_runtime_error (msg );
3137
+ else if (! internal )
3138
+ caf_runtime_error (msg );
3122
3139
}
3123
3140
}
3124
3141
0 commit comments