Skip to content

Commit b5d2bef

Browse files
authored
Merge branch 'master' into issue305
2 parents 0091963 + 8053475 commit b5d2bef

File tree

7 files changed

+129
-85
lines changed

7 files changed

+129
-85
lines changed

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,8 @@ if(opencoarrays_aware_compiler)
391391
add_mpi_test(duplicate_syncimages 8 ${tests_root}/unit/sync/duplicate_syncimages)
392392
add_mpi_test(co_reduce 4 ${tests_root}/unit/collectives/co_reduce_test)
393393
add_mpi_test(syncimages_status 32 ${tests_root}/unit/sync/syncimages_status)
394+
add_mpi_test(sync_ring_abort_np3 3 ${tests_root}/unit/sync/sync_image_ring_abort_on_stopped_image)
395+
add_mpi_test(sync_ring_abort_np7 7 ${tests_root}/unit/sync/sync_image_ring_abort_on_stopped_image)
394396
add_mpi_test(simpleatomics 32 ${tests_root}/unit/simple/atomics)
395397
# possible logic error in the following test
396398
# add_mpi_test(increment_my_neighbor 32 ${tests_root}/unit/simple/increment_my_neighbor)

src/mpi/mpi_caf.c

Lines changed: 89 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ static int caf_is_finalized = 0;
7575
/*Sync image part*/
7676

7777
static int *orders;
78-
MPI_Request *handlers;
7978
static int *images_full;
79+
MPI_Request *sync_handles;
8080
static int *arrived;
8181

8282
/* Pending puts */
@@ -96,8 +96,8 @@ caf_static_t *caf_tot = NULL;
9696

9797
/* Image status variable */
9898

99-
static int *img_status = NULL;
100-
MPI_Win *stat_tok;
99+
static int img_status = 0;
100+
static MPI_Win *stat_tok;
101101

102102
/* Active messages variables */
103103

@@ -400,27 +400,31 @@ PREFIX (init) (int *argc, char ***argv)
400400
orders = calloc (caf_num_images, sizeof (int));
401401
arrived = calloc (caf_num_images, sizeof (int));
402402

403-
handlers = malloc(caf_num_images * sizeof(MPI_Request));
403+
sync_handles = malloc(caf_num_images * sizeof(MPI_Request));
404404

405405
stat_tok = malloc (sizeof(MPI_Win));
406406

407407
#if MPI_VERSION >= 3
408408
MPI_Info_create (&mpi_info_same_size);
409409
MPI_Info_set (mpi_info_same_size, "same_size", "true");
410410
/* Setting img_status */
411-
MPI_Win_allocate(sizeof(int), 1, mpi_info_same_size, CAF_COMM_WORLD, &img_status, stat_tok);
411+
MPI_Win_create(&img_status, sizeof(int), 1, mpi_info_same_size, CAF_COMM_WORLD, stat_tok);
412412
# ifndef CAF_MPI_LOCK_UNLOCK
413413
MPI_Win_lock_all(MPI_MODE_NOCHECK, *stat_tok);
414414
# endif // CAF_MPI_LOCK_UNLOCK
415415
#else
416416
MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &img_status, stat_tok);
417417
MPI_Win_create(img_status, sizeof(int), 1, MPI_INFO_NULL, CAF_COMM_WORLD, stat_tok);
418418
#endif // MPI_VERSION
419-
*img_status = 0;
420419
}
421420
/* MPI_Barrier(CAF_COMM_WORLD); */
422421
}
423422

423+
/* Forward declaration of sync_images. */
424+
425+
void
426+
sync_images_internal (int count, int images[], int *stat, char *errmsg,
427+
int errmsg_len, bool internal);
424428

425429
/* Finalize coarray program. */
426430

@@ -431,10 +435,17 @@ _gfortran_caf_finalize (void)
431435
PREFIX (finalize) (void)
432436
#endif
433437
{
434-
*img_status = STAT_STOPPED_IMAGE; /* GFC_STAT_STOPPED_IMAGE = 6000 */
435-
MPI_Win_sync(*stat_tok);
438+
/* For future security enclose setting img_status in a lock. */
439+
CAF_Win_lock (MPI_LOCK_EXCLUSIVE, caf_this_image - 1, *stat_tok);
440+
img_status = STAT_STOPPED_IMAGE; /* GFC_STAT_STOPPED_IMAGE = 6000 */
441+
CAF_Win_unlock (caf_this_image - 1, *stat_tok);
442+
443+
/* Announce to all other images, that this one is stopped. */
444+
for (int i = 0; i < caf_num_images - 1; ++i)
445+
MPI_Send (&img_status, 1, MPI_INT, images_full[i] - 1, 0, CAF_COMM_WORLD);
436446

437-
MPI_Barrier(CAF_COMM_WORLD);
447+
/* Add a conventional barrier to prevent images from quitting to early. */
448+
MPI_Barrier (CAF_COMM_WORLD);
438449

439450
while (caf_static_list != NULL)
440451
{
@@ -464,19 +475,23 @@ PREFIX (finalize) (void)
464475
MPI_Win_free (mpi_token->desc);
465476
free (mpi_token->desc);
466477
}
467-
#else
478+
#else // GCC_GE_7
468479
# ifndef CAF_MPI_LOCK_UNLOCK
469480
MPI_Win_unlock_all(*p);
470481
# endif // CAF_MPI_LOCK_UNLOCK
482+
#endif // GCC_GE_7
471483
MPI_Win_free(p);
472-
#endif
473484
free(tmp_tot);
474485
tmp_tot = prev;
475486
}
476487
#if MPI_VERSION >= 3
477488
MPI_Info_free (&mpi_info_same_size);
478489
#endif // MPI_VERSION
479490

491+
# ifndef CAF_MPI_LOCK_UNLOCK
492+
MPI_Win_unlock_all (*stat_tok);
493+
# endif // CAF_MPI_LOCK_UNLOCK
494+
MPI_Win_free (stat_tok);
480495
MPI_Comm_free(&CAF_COMM_WORLD);
481496

482497
/* Only call Finalize if CAF runtime Initialized MPI. */
@@ -486,7 +501,7 @@ PREFIX (finalize) (void)
486501
pthread_mutex_lock(&lock_am);
487502
caf_is_finalized = 1;
488503
pthread_mutex_unlock(&lock_am);
489-
exit(0);
504+
free (sync_handles);
490505
}
491506

492507

@@ -541,23 +556,21 @@ PREFIX (register) (size_t size, caf_register_t type, caf_token_t *token,
541556
mpi_token = (mpi_caf_token_t *) *token;
542557
p = TOKEN(mpi_token);
543558

544-
if (type == CAF_REGTYPE_COARRAY_ALLOC_ALLOCATE_ONLY
545-
|| type == CAF_REGTYPE_COARRAY_ALLOC
546-
|| type == CAF_REGTYPE_COARRAY_STATIC)
559+
if ((type == CAF_REGTYPE_COARRAY_ALLOC_ALLOCATE_ONLY
560+
|| type == CAF_REGTYPE_COARRAY_ALLOC
561+
|| type == CAF_REGTYPE_COARRAY_STATIC)
562+
&& GFC_DESCRIPTOR_RANK (desc) != 0)
547563
{
548-
if (GFC_DESCRIPTOR_RANK (desc) == 0)
549-
mpi_token->desc = NULL;
550-
else
551-
{
552-
int ierr;
553-
size_t desc_size = sizeof (gfc_descriptor_t) + /*GFC_DESCRIPTOR_RANK (desc)*/
554-
GFC_MAX_DIMENSIONS * sizeof (descriptor_dimension);
555-
mpi_token->desc = (MPI_Win *)malloc (sizeof (MPI_Win));
556-
ierr = MPI_Win_create (desc, desc_size, 1, mpi_info_same_size,
557-
CAF_COMM_WORLD, mpi_token->desc);
558-
CAF_Win_lock_all (*(mpi_token->desc));
559-
}
564+
int ierr;
565+
size_t desc_size = sizeof (gfc_descriptor_t) + /*GFC_DESCRIPTOR_RANK (desc)*/
566+
GFC_MAX_DIMENSIONS * sizeof (descriptor_dimension);
567+
mpi_token->desc = (MPI_Win *)malloc (sizeof (MPI_Win));
568+
ierr = MPI_Win_create (desc, desc_size, 1, mpi_info_same_size,
569+
CAF_COMM_WORLD, mpi_token->desc);
570+
CAF_Win_lock_all (*(mpi_token->desc));
560571
}
572+
else
573+
mpi_token->desc = NULL;
561574

562575
#if MPI_VERSION >= 3
563576
if (type != CAF_REGTYPE_COARRAY_ALLOC_REGISTER_ONLY)
@@ -2990,7 +3003,14 @@ void
29903003
PREFIX (sync_images) (int count, int images[], int *stat, char *errmsg,
29913004
int errmsg_len)
29923005
{
2993-
int ierr = 0, i = 0, remote_stat = 0, j = 0;
3006+
sync_images_internal (count, images, stat, errmsg, errmsg_len, false);
3007+
}
3008+
3009+
void
3010+
sync_images_internal (int count, int images[], int *stat, char *errmsg,
3011+
int errmsg_len, bool internal)
3012+
{
3013+
int ierr = 0, i = 0, j = 0, int_zero = 0, done_count = 0;
29943014
MPI_Status s;
29953015

29963016
if (count == 0 || (count == 1 && images[0] == caf_this_image))
@@ -3044,59 +3064,54 @@ PREFIX (sync_images) (int count, int images[], int *stat, char *errmsg,
30443064
explicit_flush();
30453065
#endif
30463066

3067+
/* A rather simple way to synchronice:
3068+
- expect all images to sync with receiving an int,
3069+
- on the other side, send all processes to sync with an int,
3070+
- when the int received is STAT_STOPPED_IMAGE the return immediately,
3071+
else wait until all images in the current set of images have send
3072+
some data, i.e., synced.
3073+
3074+
This approach as best as possible implements the syncing of different
3075+
sets of images and figuring that an image has stopped. MPI does not
3076+
provide any direct means of syncing non-coherent sets of images.
3077+
The groups/communicators of MPI always need to be consistent, i.e.,
3078+
have the same members on all images participating. This is
3079+
contradictiory to the sync images statement, where syncing, e.g., in a
3080+
ring pattern is possible.
3081+
3082+
This implementation guarantees, that as long as no image is stopped
3083+
an image only is allowed to continue, when all its images to sync to
3084+
also have reached a sync images statement. This implementation makes
3085+
no assumption when the image continues or in which order synced
3086+
images continue. */
30473087
for(i = 0; i < count; ++i)
30483088
/* Need to have the request handlers contigously in the handlers
30493089
array or waitany below will trip about the handler as illegal. */
30503090
ierr = MPI_Irecv (&arrived[images[i] - 1], 1, MPI_INT, images[i] - 1, 0,
3051-
CAF_COMM_WORLD, &handlers[i]);
3091+
CAF_COMM_WORLD, &sync_handles[i]);
30523092
for(i = 0; i < count; ++i)
3093+
MPI_Send (&int_zero, 1, MPI_INT, images[i] - 1, 0, CAF_COMM_WORLD);
3094+
done_count = 0;
3095+
while (done_count < count)
30533096
{
3054-
# ifdef CAF_MPI_LOCK_UNLOCK
3055-
MPI_Win_lock (MPI_LOCK_SHARED, images[i] - 1, 0, *stat_tok);
3056-
# endif // CAF_MPI_LOCK_UNLOCK
3057-
ierr = MPI_Get (&remote_stat, 1, MPI_INT,
3058-
images[i] - 1, 0, 1, MPI_INT, *stat_tok);
3059-
# ifdef CAF_MPI_LOCK_UNLOCK
3060-
MPI_Win_unlock (images[i] - 1, *stat_tok);
3061-
# else // CAF_MPI_LOCK_UNLOCK
3062-
MPI_Win_flush (images[i] - 1, *stat_tok);
3063-
# endif // CAF_MPI_LOCK_UNLOCK
3064-
if(remote_stat != 0)
3065-
{
3066-
ierr = STAT_STOPPED_IMAGE;
3067-
/* Let the other images know, that at least one image is
3068-
stopped by sending STAT_STOPPED_IMAGE instead of our id. */
3069-
for(i = 0; i < count; ++i)
3070-
MPI_Send (&ierr, 1, MPI_INT, images[i] - 1, 0, CAF_COMM_WORLD);
3071-
break;
3072-
}
3073-
}
3074-
if (ierr == 0)
3075-
{
3076-
int done_count = 0;
3077-
for(i = 0; i < count; ++i)
3097+
ierr = MPI_Waitany (count, sync_handles, &i, &s);
3098+
if (i != MPI_UNDEFINED)
30783099
{
3079-
if (arrived[images[i] - 1] != STAT_STOPPED_IMAGE)
3080-
/* Only send, when no stopped images have been found. */
3081-
ierr = MPI_Send (&caf_this_image, 1, MPI_INT, images[i] - 1, 0,
3082-
CAF_COMM_WORLD);
3083-
else
3084-
ierr = STAT_STOPPED_IMAGE;
3085-
}
3086-
3087-
while (ierr != STAT_STOPPED_IMAGE && done_count < count)
3088-
{
3089-
ierr = MPI_Waitany (count, handlers, &i, &s);
3090-
if (i != MPI_UNDEFINED)
3091-
++done_count;
3092-
if (i != MPI_UNDEFINED && arrived[i] == STAT_STOPPED_IMAGE)
3093-
ierr = STAT_STOPPED_IMAGE;
3094-
else if (ierr != MPI_SUCCESS)
3095-
break;
3100+
++done_count;
3101+
if (ierr == MPI_SUCCESS && arrived[i] == STAT_STOPPED_IMAGE)
3102+
{
3103+
/* Possible future extension: Abort pending receives. At the
3104+
moment the receives are discarded by the program
3105+
termination. For the tested mpi-implementation this is ok.
3106+
*/
3107+
ierr = STAT_STOPPED_IMAGE;
3108+
break;
3109+
}
30963110
}
3111+
else if (ierr != MPI_SUCCESS)
3112+
/* Abort receives here, too, when implemented above. */
3113+
break;
30973114
}
3098-
3099-
memset(arrived, 0, sizeof(int) * caf_num_images);
31003115
}
31013116

31023117
sync_images_err_chk:
@@ -3119,8 +3134,8 @@ PREFIX (sync_images) (int count, int images[], int *stat, char *errmsg,
31193134
if (errmsg_len > len)
31203135
memset (&errmsg[len], ' ', errmsg_len-len);
31213136
}
3122-
else
3123-
caf_runtime_error (msg);
3137+
else if (!internal)
3138+
caf_runtime_error (msg);
31243139
}
31253140
}
31263141

src/tests/integration/events/async-hello.F90

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,7 @@ program main
6969
end do
7070

7171
sync all
72-
! Hardwire failure with GCC 7 (remove this preprocessor conditional after the silent failure with GCC 7 has been eliminated)
73-
#if __GNUC__ >= 7
74-
#else
7572
if (me==1) print *,"Test passed."
76-
#endif
7773

7874
end associate
7975

src/tests/regression/reported/issue-293-silent-event-failure.F90

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,17 +34,13 @@ program main
3434
!! date: 2016-12-22
3535
!! category: regression
3636
!! Test posting a static event_type coarray
37-
!! OpenCoarrays issue #293
37+
!! OpenCoarrays issue [#293](https://github.com/sourceryinstitute/opencoarrays/issues/293)
3838
use iso_fortran_env, only : event_type
3939
implicit none
4040
type(event_type) :: test_post[*]
4141

4242
if (num_images()<3) error stop "exposing issue 293 requires num_images()>=3"
4343
event post(test_post)
44-
! Hardwire failure with GCC 7 (remove this preprocessor conditional after the silent failure with GCC 7 has been eliminated)
45-
#if __GNUC__ >= 7
46-
#else
47-
if (this_image()==1) print *,"Test passed."
48-
#endif
44+
if (this_image()==1) print *,"Test passed."
4945

5046
end program

src/tests/unit/sync/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,6 @@ target_link_libraries(duplicate_syncimages OpenCoarrays)
1212

1313
add_executable(syncimages_status syncimages_status.f90)
1414
target_link_libraries(syncimages_status OpenCoarrays)
15+
16+
add_executable(sync_image_ring_abort_on_stopped_image sync_image_ring_abort_on_stopped_image.f90)
17+
target_link_libraries(sync_image_ring_abort_on_stopped_image OpenCoarrays)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
program sync_image_ring_abort_on_stopped_image
2+
!! `SYNC IMAGES([this_image - 1, this_image + 1])` with
3+
!! `STAT=STAT_STOPPED_IMAGE` specifier on a periodic ring. The test
4+
!! checks that syncing in a ring with a stopped image still
5+
!! terminates all images. All images other than image 1 participate
6+
!! in the `sync images()` call
7+
8+
use, intrinsic:: iso_fortran_env
9+
implicit none
10+
11+
integer :: stat_var = 0
12+
13+
if (num_images() .lt. 2) error stop "Need at least two images to test."
14+
15+
associate (me => this_image())
16+
if (me == 1) then
17+
continue !! image 1 does not participate and exits, creating a stopped image
18+
else
19+
associate (lhs => merge(me - 1, num_images(), me /= 1), &
20+
rhs => merge(me + 1, 1, me /= num_images()))
21+
sync images([lhs, rhs], STAT=stat_var)
22+
!! Only images bordering image 1 (i.e., 2 and `num_images()`) can
23+
!! accurately test whether a stopped image is present. All other
24+
!! images could be up ahead.
25+
if (stat_var /= STAT_STOPPED_IMAGE .and. me == 2) &
26+
error stop "Error: stat_var /= STAT_STOPPED_IMAGE: "
27+
if (stat_var /= STAT_STOPPED_IMAGE .and. me == num_images()) &
28+
error stop "Error: stat_var /= STAT_STOPPED_IMAGE: "
29+
if(me == 2) print *, 'Test passed.'
30+
end associate
31+
end if
32+
end associate
33+
end program

src/tests/unit/sync/syncimages_status.f90

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ program sync_images_stat
1010
me = this_image()
1111

1212
if (me /= 1 ) then
13-
call sleep(1)
1413
sync images(*,STAT=stat_var)
1514
if ( stat_var /= STAT_STOPPED_IMAGE) then
1615
print *, "Error:stat_var /= STAT_STOPPED_IMAGE: ", me

0 commit comments

Comments
 (0)