Skip to content

Commit f4411e6

Browse files
author
Alessandro Fanfarillo
committed
Failed images fixed
1 parent 6a1fbab commit f4411e6

File tree

1 file changed

+43
-17
lines changed

1 file changed

+43
-17
lines changed

src/mpi/mpi_caf.c

Lines changed: 43 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,7 @@ MPI_Comm lock_comm,stopped_comm;
111111
MPI_Request lock_req,stopped_req;
112112
int used_comm = -1, n_failed_imgs=0;
113113
int error_called = 0, fake_error_called = 0;
114-
int *ranks_gc,*ranks_gf; //to be returned by failed images
114+
int *ranks_gc,*ranks_gf, *failed_images_array; //to be returned by failed images
115115
MPI_Errhandler errh,errh_w,errh_fake;
116116
int completed = 0,tmp_lock;
117117
int *stopped_images;
@@ -129,22 +129,34 @@ static void verbose_win_errhandler(MPI_Win* win, int* err, ...) {
129129

130130
static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){
131131
MPI_Comm comm;
132-
int nf,nc,i;
132+
int nf,nc,i,old_nf,j;
133133
MPI_Group group_c, group_f;
134134
comm = *pcomm;
135+
136+
old_nf = n_failed_imgs;
135137

136138
MPIX_Comm_failure_ack(comm);
137139
MPIX_Comm_failure_get_acked(comm, &group_f);
138140
MPI_Group_size(group_f, &nf);
139-
MPI_Comm_group(comm, &group_c);
141+
/* MPI_Comm_group(comm, &group_c); */
142+
MPI_Comm_group(MPI_COMM_WORLD, &group_c);
140143
for(i = 0; i < nf; i++)
141144
ranks_gf[i] = i;
142145
MPI_Group_translate_ranks(group_f, nf, ranks_gf,
143146
group_c, ranks_gc);
144-
for(i = 0; i < nf; i++)
145-
ranks_gc[i]++;
147+
printf("%d in verbose old_nf:%d nf:%d\n",caf_this_image,old_nf,nf);
146148

147149
n_failed_imgs += nf;
150+
j=0;
151+
152+
for(i = old_nf; i < n_failed_imgs; i++)
153+
{
154+
failed_images_array[i] = ranks_gc[j];
155+
printf("Ranks_gc %d\n",ranks_gc[j]);
156+
failed_images_array[i]++;
157+
j++;
158+
}
159+
148160
error_called = 1;
149161
}
150162

@@ -322,7 +334,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat,
322334

323335
for(i=0;i<n_failed_imgs;i++)
324336
{
325-
if(ranks_gc[i] == value)
337+
if(failed_images_array[i] == value)
326338
{
327339
# ifdef CAF_MPI_LOCK_UNLOCK
328340
MPI_Win_lock (MPI_LOCK_EXCLUSIVE, image_index-1, 0, win);
@@ -432,6 +444,7 @@ PREFIX (init) (int *argc, char ***argv)
432444
if (caf_num_images == 0)
433445
{
434446
int ierr = 0, i = 0, j = 0;
447+
n_failed_imgs = 0;
435448

436449
int is_init = 0, prior_thread_level = MPI_THREAD_SINGLE;
437450
MPI_Initialized(&is_init);
@@ -505,6 +518,7 @@ PREFIX (init) (int *argc, char ***argv)
505518

506519
ranks_gf = (int*)calloc(caf_num_images,sizeof(int));
507520
ranks_gc = (int*)calloc(caf_num_images,sizeof(int));
521+
failed_images_array = (int*)calloc(caf_num_images,sizeof(int));
508522
stopped_images = (int*)calloc(caf_num_images, sizeof(int));
509523

510524
#if MPI_VERSION >= 3
@@ -522,7 +536,7 @@ PREFIX (init) (int *argc, char ***argv)
522536
*img_status = 0;
523537
MPI_Win_set_errhandler(*stat_tok,errh_w);
524538
}
525-
/* MPI_Barrier(CAF_COMM_WORLD); */
539+
MPI_Barrier(CAF_COMM_WORLD);
526540
}
527541

528542
/* Finalize coarray program. */
@@ -609,8 +623,9 @@ int communicator_shrink(MPI_Comm *comm)
609623
MPI_Comm_set_errhandler( shrunk, errh );
610624
MPI_Comm_size(shrunk, &ns); MPI_Comm_rank(shrunk, &srank);
611625

612-
MPI_Comm_rank(*comm, &crank);
613-
626+
// MPI_Comm_rank(*comm, &crank);
627+
MPI_Comm_rank(MPI_COMM_WORLD, &crank);
628+
printf("me: %d becomes: %d\n",caf_this_image,crank+1);
614629
/* Split does the magic: removing spare processes and reordering ranks
615630
* so that all surviving processes remain at their former place */
616631
if (*img_status == STAT_STOPPED_IMAGE)
@@ -863,6 +878,15 @@ void
863878
PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len)
864879
{
865880
int ierr=0,flag=0;
881+
882+
if(error_called == 1)
883+
{
884+
printf("%d First if in sync all\n",caf_this_image);
885+
communicator_shrink(&CAF_COMM_WORLD);
886+
error_called = 0;
887+
ierr = STAT_FAILED_IMAGE;
888+
/* MPI_Barrier(CAF_COMM_WORLD); */
889+
}
866890

867891
if (unlikely (caf_is_finalized))
868892
ierr = STAT_STOPPED_IMAGE;
@@ -876,6 +900,7 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len)
876900

877901
if(error_called == 1)
878902
{
903+
printf("%d Second if in sync all\n",caf_this_image);
879904
communicator_shrink(&CAF_COMM_WORLD);
880905
error_called = 0;
881906
ierr = STAT_FAILED_IMAGE;
@@ -1221,7 +1246,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
12211246
#endif // CAF_MPI_LOCK_UNLOCK
12221247
}
12231248

1224-
MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE);
1249+
/* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */
12251250

12261251
if(error_called == 1)
12271252
{
@@ -1338,10 +1363,11 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
13381363
MPI_Win_flush (image_index-1, *p);
13391364
# endif // CAF_MPI_LOCK_UNLOCK
13401365

1341-
MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE);
1366+
/* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */
13421367

13431368
if(error_called == 1)
13441369
{
1370+
printf("%d In second shrink\n",caf_this_image);
13451371
communicator_shrink(&CAF_COMM_WORLD);
13461372
error_called = 0;
13471373
ierr = STAT_FAILED_IMAGE;
@@ -1479,7 +1505,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
14791505
MPI_Win_flush (image_index-1, *p);
14801506
# endif // CAF_MPI_LOCK_UNLOCK
14811507
#endif
1482-
MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE);
1508+
/* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */
14831509

14841510
if(error_called == 1)
14851511
{
@@ -1571,7 +1597,7 @@ PREFIX (get) (caf_token_t token, size_t offset,
15711597
# else // CAF_MPI_LOCK_UNLOCK
15721598
MPI_Win_flush (image_index-1, *p);
15731599
# endif // CAF_MPI_LOCK_UNLOCK
1574-
MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE);
1600+
/* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */
15751601

15761602
if(error_called == 1)
15771603
{
@@ -1680,7 +1706,7 @@ PREFIX (get) (caf_token_t token, size_t offset,
16801706

16811707
ierr = MPI_Get (dst, 1, dt_d, image_index-1, offset, 1, dt_s, *p);
16821708

1683-
MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE);
1709+
/* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */
16841710

16851711
if(error_called == 1)
16861712
{
@@ -2802,7 +2828,7 @@ PREFIX (image_status) (int image)
28022828
int i,res=0, remote_stat=0,ierr;
28032829

28042830
for(i=0;i<n_failed_imgs;i++)
2805-
if(image == ranks_gc[i])
2831+
if(image == failed_images_array[i])
28062832
res = STAT_FAILED_IMAGE;
28072833

28082834
if(res == STAT_FAILED_IMAGE)
@@ -2832,10 +2858,10 @@ PREFIX (failed_images) (gfc_descriptor_t *array, int team __attribute__ ((unused
28322858
{
28332859
int *mem = (int *)calloc(n_failed_imgs,sizeof(int));
28342860
array->base_addr = mem;
2835-
memcpy(mem,ranks_gc,n_failed_imgs*sizeof(int));
2861+
memcpy(mem,failed_images_array,n_failed_imgs*sizeof(int));
28362862
qsort(mem,n_failed_imgs,sizeof(int),cmpfunc);
28372863
array->dtype = 265;
2838-
array->dim[0].lower_bound = 0;
2864+
array->dim[0].lower_bound = 1;
28392865
array->dim[0]._ubound = n_failed_imgs-1;
28402866
array->dim[0]._stride = 1;
28412867
array->offset = -1;

0 commit comments

Comments
 (0)