Skip to content

Commit 0264344

Browse files
author
Alessandro Fanfarillo
committed
Stopped images
1 parent c2887c7 commit 0264344

File tree

1 file changed

+59
-30
lines changed

1 file changed

+59
-30
lines changed

src/mpi/mpi_caf.c

Lines changed: 59 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ caf_static_t *caf_tot = NULL;
8787

8888
/* Image status variable */
8989

90-
static int *img_status = NULL;
90+
static int img_status = 0;
9191
MPI_Win *stat_tok;
9292

9393
/* Active messages variables */
@@ -114,7 +114,7 @@ int error_called = 0, fake_error_called = 0;
114114
int *ranks_gc,*ranks_gf, *failed_images_array; //to be returned by failed images
115115
MPI_Errhandler errh,errh_w,errh_fake;
116116
int completed = 0,tmp_lock;
117-
int *stopped_images, n_stopped_imgs;
117+
int *stopped_imgs, n_stopped_imgs;
118118

119119
static int cmpfunc (const void *a, const void *b)
120120
{
@@ -511,27 +511,30 @@ PREFIX (init) (int *argc, char ***argv)
511511
MPI_Comm_dup(CAF_COMM_WORLD, &stopped_comm);
512512
MPI_Comm_set_errhandler(stopped_comm, errh);
513513
MPI_Irecv(&tmp_lock,1,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,stopped_comm,&stopped_req);
514-
514+
515515
MPI_Win_create_errhandler(verbose_win_errhandler, &errh_w);
516516

517517
ranks_gf = (int*)calloc(caf_num_images,sizeof(int));
518518
ranks_gc = (int*)calloc(caf_num_images,sizeof(int));
519519
failed_images_array = (int*)calloc(caf_num_images,sizeof(int));
520-
stopped_images = (int*)calloc(caf_num_images, sizeof(int));
521-
520+
/* stopped_imgs = (int*)calloc(caf_num_images, sizeof(int)); */
521+
522522
#if MPI_VERSION >= 3
523-
MPI_Info_create (&mpi_info_same_size);
524-
MPI_Info_set (mpi_info_same_size, "same_size", "true");
523+
/* MPI_Info_create (&mpi_info_same_size); */
524+
/* MPI_Info_set (mpi_info_same_size, "same_size", "true"); */
525525
/* Setting img_status */
526-
MPI_Win_allocate(sizeof(int), 1, mpi_info_same_size, stopped_comm, &img_status, stat_tok);
526+
/* MPI_Win_allocate(sizeof(int), 1, mpi_info_same_size, stopped_comm, &img_status, stat_tok); */
527+
MPI_Win_allocate(sizeof(int)*caf_num_images, 1, MPI_INFO_NULL, stopped_comm, &stopped_imgs, stat_tok);
527528
# ifndef CAF_MPI_LOCK_UNLOCK
528529
MPI_Win_lock_all(MPI_MODE_NOCHECK, *stat_tok);
529530
# endif // CAF_MPI_LOCK_UNLOCK
530531
#else
531-
MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &img_status, stat_tok);
532-
MPI_Win_create(img_status, sizeof(int), 1, MPI_INFO_NULL, stopped_comm, stat_tok);
532+
MPI_Alloc_mem(sizeof(int)*caf_num_images, MPI_INFO_NULL, &stopped_imgs, stat_tok);
533+
MPI_Win_create(stopped_imgs, sizeof(int)*caf_num_images, 1, MPI_INFO_NULL, stopped_comm, stat_tok);
533534
#endif // MPI_VERSION
534-
*img_status = 0;
535+
for(i=0;i<caf_num_images;i++)
536+
stopped_imgs[i] = 0;
537+
/* *img_status = 0; */
535538
MPI_Win_set_errhandler(*stat_tok,errh_w);
536539
}
537540
MPI_Barrier(CAF_COMM_WORLD);
@@ -546,9 +549,27 @@ _gfortran_caf_finalize (void)
546549
PREFIX (finalize) (void)
547550
#endif
548551
{
549-
int flag = 0;
550-
*img_status = STAT_STOPPED_IMAGE; /* GFC_STAT_STOPPED_IMAGE = 6000 */
551-
MPI_Win_sync(*stat_tok);
552+
int flag = 0,i=0,j=0,failed=0,one=1;
553+
img_status = STAT_STOPPED_IMAGE; /* GFC_STAT_STOPPED_IMAGE = 6000 */
554+
stopped_imgs[caf_this_image-1] = 1;
555+
/* MPI_Win_sync(*stat_tok); */
556+
557+
for(i=0;i<caf_num_images;i++)
558+
{
559+
for (j = 0; j < n_failed_imgs; j++)
560+
if (i == (failed_images_array[j] - 1))
561+
{
562+
failed = 1;
563+
break;
564+
}
565+
if (!failed)
566+
{
567+
MPI_Accumulate (&one, 1, MPI_INT, i, (caf_this_image-1)*sizeof(int), 1, MPI_INT, MPI_REPLACE, *stat_tok);
568+
}
569+
failed = 0;
570+
}
571+
572+
MPI_Win_flush_all(*stat_tok);
552573

553574
MPIX_Comm_revoke(CAF_COMM_WORLD);
554575
communicator_shrink(&CAF_COMM_WORLD);
@@ -578,7 +599,7 @@ PREFIX (finalize) (void)
578599
tmp_tot = prev;
579600
}
580601
#if MPI_VERSION >= 3
581-
MPI_Info_free (&mpi_info_same_size);
602+
/* MPI_Info_free (&mpi_info_same_size); */
582603
#endif // MPI_VERSION
583604

584605
/* MPI_Comm_free(&CAF_COMM_WORLD); */
@@ -625,7 +646,7 @@ int communicator_shrink(MPI_Comm *comm)
625646
MPI_Comm_rank(MPI_COMM_WORLD, &crank);
626647
/* Split does the magic: removing spare processes and reordering ranks
627648
* so that all surviving processes remain at their former place */
628-
if (*img_status == STAT_STOPPED_IMAGE)
649+
if (img_status == STAT_STOPPED_IMAGE)
629650
crank = -1;
630651
rc = MPI_Comm_split(shrunk, crank<0?MPI_UNDEFINED:1, crank, newcomm);
631652
flag = (rc == MPI_SUCCESS);
@@ -2819,7 +2840,7 @@ PREFIX (fail_image) (void)
28192840
int
28202841
PREFIX (image_status) (int image)
28212842
{
2822-
int i,res=0, remote_stat=0,ierr;
2843+
int i,res=0, ierr;
28232844

28242845
for(i=0;i<n_failed_imgs;i++)
28252846
if(image == failed_images_array[i])
@@ -2828,16 +2849,16 @@ PREFIX (image_status) (int image)
28282849
if(res == STAT_FAILED_IMAGE)
28292850
return res;
28302851

2831-
# ifdef CAF_MPI_LOCK_UNLOCK
2832-
MPI_Win_lock (MPI_LOCK_SHARED, image-1, 0, *stat_tok);
2833-
# endif // CAF_MPI_LOCK_UNLOCK
2834-
ierr = MPI_Get (&remote_stat, 1, MPI_INT, image-1, 0, 1, MPI_INT, *stat_tok);
2835-
# ifdef CAF_MPI_LOCK_UNLOCK
2836-
MPI_Win_unlock (image-1, *stat_tok);
2837-
# else // CAF_MPI_LOCK_UNLOCK
2838-
MPI_Win_flush (image-1, *stat_tok);
2839-
# endif // CAF_MPI_LOCK_UNLOCK
2840-
if(remote_stat != 0)
2852+
/* # ifdef CAF_MPI_LOCK_UNLOCK */
2853+
/* MPI_Win_lock (MPI_LOCK_SHARED, image-1, 0, *stat_tok); */
2854+
/* # endif // CAF_MPI_LOCK_UNLOCK */
2855+
/* ierr = MPI_Get (&remote_stat, 1, MPI_INT, image-1, 0, 1, MPI_INT, *stat_tok); */
2856+
/* # ifdef CAF_MPI_LOCK_UNLOCK */
2857+
/* MPI_Win_unlock (image-1, *stat_tok); */
2858+
/* # else // CAF_MPI_LOCK_UNLOCK */
2859+
/* MPI_Win_flush (image-1, *stat_tok); */
2860+
/* # endif // CAF_MPI_LOCK_UNLOCK */
2861+
if(stopped_imgs[image-1] != 0)
28412862
res = STAT_STOPPED_IMAGE;
28422863

28432864
if(ierr != MPI_SUCCESS)
@@ -2865,10 +2886,18 @@ void
28652886
PREFIX (stopped_images) (gfc_descriptor_t *array, int team __attribute__ ((unused)),
28662887
int kind __attribute__ ((unused)))
28672888
{
2868-
int *mem = (int *)calloc(n_stopped_imgs,sizeof(int));
2889+
int i, *mem,j=0;
2890+
n_stopped_imgs = 0;
2891+
mem = (int *)calloc(caf_num_images,sizeof(int));
2892+
for(i=0;i<caf_num_images;i++)
2893+
if(stopped_imgs[i] == 1)
2894+
{
2895+
mem[j] = i+1;
2896+
j++;
2897+
}
2898+
n_stopped_imgs = j;
2899+
mem = realloc(mem,sizeof(int)*n_stopped_imgs);
28692900
array->base_addr = mem;
2870-
memcpy(mem,stopped_images,n_stopped_imgs*sizeof(int));
2871-
qsort(mem,n_stopped_imgs,sizeof(int),cmpfunc);
28722901
array->dtype = 265;
28732902
array->dim[0].lower_bound = 1;
28742903
array->dim[0]._ubound = n_stopped_imgs-1;

0 commit comments

Comments
 (0)