@@ -111,7 +111,7 @@ MPI_Comm lock_comm,stopped_comm;
111
111
MPI_Request lock_req ,stopped_req ;
112
112
int used_comm = -1 , n_failed_imgs = 0 ;
113
113
int error_called = 0 , fake_error_called = 0 ;
114
- int * ranks_gc ,* ranks_gf ; //to be returned by failed images
114
+ int * ranks_gc ,* ranks_gf , * failed_images_array ; //to be returned by failed images
115
115
MPI_Errhandler errh ,errh_w ,errh_fake ;
116
116
int completed = 0 ,tmp_lock ;
117
117
int * stopped_images ;
@@ -129,22 +129,34 @@ static void verbose_win_errhandler(MPI_Win* win, int* err, ...) {
129
129
130
130
static void verbose_comm_errhandler (MPI_Comm * pcomm , int * err , ...){
131
131
MPI_Comm comm ;
132
- int nf ,nc ,i ;
132
+ int nf ,nc ,i , old_nf , j ;
133
133
MPI_Group group_c , group_f ;
134
134
comm = * pcomm ;
135
+
136
+ old_nf = n_failed_imgs ;
135
137
136
138
MPIX_Comm_failure_ack (comm );
137
139
MPIX_Comm_failure_get_acked (comm , & group_f );
138
140
MPI_Group_size (group_f , & nf );
139
- MPI_Comm_group (comm , & group_c );
141
+ /* MPI_Comm_group(comm, &group_c); */
142
+ MPI_Comm_group (MPI_COMM_WORLD , & group_c );
140
143
for (i = 0 ; i < nf ; i ++ )
141
144
ranks_gf [i ] = i ;
142
145
MPI_Group_translate_ranks (group_f , nf , ranks_gf ,
143
146
group_c , ranks_gc );
144
- for (i = 0 ; i < nf ; i ++ )
145
- ranks_gc [i ]++ ;
147
+ printf ("%d in verbose old_nf:%d nf:%d\n" ,caf_this_image ,old_nf ,nf );
146
148
147
149
n_failed_imgs += nf ;
150
+ j = 0 ;
151
+
152
+ for (i = old_nf ; i < n_failed_imgs ; i ++ )
153
+ {
154
+ failed_images_array [i ] = ranks_gc [j ];
155
+ printf ("Ranks_gc %d\n" ,ranks_gc [j ]);
156
+ failed_images_array [i ]++ ;
157
+ j ++ ;
158
+ }
159
+
148
160
error_called = 1 ;
149
161
}
150
162
@@ -322,7 +334,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat,
322
334
323
335
for (i = 0 ;i < n_failed_imgs ;i ++ )
324
336
{
325
- if (ranks_gc [i ] == value )
337
+ if (failed_images_array [i ] == value )
326
338
{
327
339
# ifdef CAF_MPI_LOCK_UNLOCK
328
340
MPI_Win_lock (MPI_LOCK_EXCLUSIVE , image_index - 1 , 0 , win );
@@ -432,6 +444,7 @@ PREFIX (init) (int *argc, char ***argv)
432
444
if (caf_num_images == 0 )
433
445
{
434
446
int ierr = 0 , i = 0 , j = 0 ;
447
+ n_failed_imgs = 0 ;
435
448
436
449
int is_init = 0 , prior_thread_level = MPI_THREAD_SINGLE ;
437
450
MPI_Initialized (& is_init );
@@ -505,6 +518,7 @@ PREFIX (init) (int *argc, char ***argv)
505
518
506
519
ranks_gf = (int * )calloc (caf_num_images ,sizeof (int ));
507
520
ranks_gc = (int * )calloc (caf_num_images ,sizeof (int ));
521
+ failed_images_array = (int * )calloc (caf_num_images ,sizeof (int ));
508
522
stopped_images = (int * )calloc (caf_num_images , sizeof (int ));
509
523
510
524
#if MPI_VERSION >= 3
@@ -522,7 +536,7 @@ PREFIX (init) (int *argc, char ***argv)
522
536
* img_status = 0 ;
523
537
MPI_Win_set_errhandler (* stat_tok ,errh_w );
524
538
}
525
- /* MPI_Barrier(CAF_COMM_WORLD); */
539
+ MPI_Barrier (CAF_COMM_WORLD );
526
540
}
527
541
528
542
/* Finalize coarray program. */
@@ -609,8 +623,9 @@ int communicator_shrink(MPI_Comm *comm)
609
623
MPI_Comm_set_errhandler ( shrunk , errh );
610
624
MPI_Comm_size (shrunk , & ns ); MPI_Comm_rank (shrunk , & srank );
611
625
612
- MPI_Comm_rank (* comm , & crank );
613
-
626
+ // MPI_Comm_rank(*comm, &crank);
627
+ MPI_Comm_rank (MPI_COMM_WORLD , & crank );
628
+ printf ("me: %d becomes: %d\n" ,caf_this_image ,crank + 1 );
614
629
/* Split does the magic: removing spare processes and reordering ranks
615
630
* so that all surviving processes remain at their former place */
616
631
if (* img_status == STAT_STOPPED_IMAGE )
@@ -863,6 +878,15 @@ void
863
878
PREFIX (sync_all ) (int * stat , char * errmsg , int errmsg_len )
864
879
{
865
880
int ierr = 0 ,flag = 0 ;
881
+
882
+ if (error_called == 1 )
883
+ {
884
+ printf ("%d First if in sync all\n" ,caf_this_image );
885
+ communicator_shrink (& CAF_COMM_WORLD );
886
+ error_called = 0 ;
887
+ ierr = STAT_FAILED_IMAGE ;
888
+ /* MPI_Barrier(CAF_COMM_WORLD); */
889
+ }
866
890
867
891
if (unlikely (caf_is_finalized ))
868
892
ierr = STAT_STOPPED_IMAGE ;
@@ -876,6 +900,7 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len)
876
900
877
901
if (error_called == 1 )
878
902
{
903
+ printf ("%d Second if in sync all\n" ,caf_this_image );
879
904
communicator_shrink (& CAF_COMM_WORLD );
880
905
error_called = 0 ;
881
906
ierr = STAT_FAILED_IMAGE ;
@@ -1221,7 +1246,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
1221
1246
#endif // CAF_MPI_LOCK_UNLOCK
1222
1247
}
1223
1248
1224
- MPI_Test (& lock_req ,& flag ,MPI_STATUS_IGNORE );
1249
+ /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */
1225
1250
1226
1251
if (error_called == 1 )
1227
1252
{
@@ -1338,10 +1363,11 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
1338
1363
MPI_Win_flush (image_index - 1 , * p );
1339
1364
# endif // CAF_MPI_LOCK_UNLOCK
1340
1365
1341
- MPI_Test (& lock_req ,& flag ,MPI_STATUS_IGNORE );
1366
+ /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */
1342
1367
1343
1368
if (error_called == 1 )
1344
1369
{
1370
+ printf ("%d In second shrink\n" ,caf_this_image );
1345
1371
communicator_shrink (& CAF_COMM_WORLD );
1346
1372
error_called = 0 ;
1347
1373
ierr = STAT_FAILED_IMAGE ;
@@ -1479,7 +1505,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
1479
1505
MPI_Win_flush (image_index - 1 , * p );
1480
1506
# endif // CAF_MPI_LOCK_UNLOCK
1481
1507
#endif
1482
- MPI_Test (& lock_req ,& flag ,MPI_STATUS_IGNORE );
1508
+ /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */
1483
1509
1484
1510
if (error_called == 1 )
1485
1511
{
@@ -1571,7 +1597,7 @@ PREFIX (get) (caf_token_t token, size_t offset,
1571
1597
# else // CAF_MPI_LOCK_UNLOCK
1572
1598
MPI_Win_flush (image_index - 1 , * p );
1573
1599
# endif // CAF_MPI_LOCK_UNLOCK
1574
- MPI_Test (& lock_req ,& flag ,MPI_STATUS_IGNORE );
1600
+ /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */
1575
1601
1576
1602
if (error_called == 1 )
1577
1603
{
@@ -1680,7 +1706,7 @@ PREFIX (get) (caf_token_t token, size_t offset,
1680
1706
1681
1707
ierr = MPI_Get (dst , 1 , dt_d , image_index - 1 , offset , 1 , dt_s , * p );
1682
1708
1683
- MPI_Test (& lock_req ,& flag ,MPI_STATUS_IGNORE );
1709
+ /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */
1684
1710
1685
1711
if (error_called == 1 )
1686
1712
{
@@ -2802,7 +2828,7 @@ PREFIX (image_status) (int image)
2802
2828
int i ,res = 0 , remote_stat = 0 ,ierr ;
2803
2829
2804
2830
for (i = 0 ;i < n_failed_imgs ;i ++ )
2805
- if (image == ranks_gc [i ])
2831
+ if (image == failed_images_array [i ])
2806
2832
res = STAT_FAILED_IMAGE ;
2807
2833
2808
2834
if (res == STAT_FAILED_IMAGE )
@@ -2832,10 +2858,10 @@ PREFIX (failed_images) (gfc_descriptor_t *array, int team __attribute__ ((unused
2832
2858
{
2833
2859
int * mem = (int * )calloc (n_failed_imgs ,sizeof (int ));
2834
2860
array -> base_addr = mem ;
2835
- memcpy (mem ,ranks_gc ,n_failed_imgs * sizeof (int ));
2861
+ memcpy (mem ,failed_images_array ,n_failed_imgs * sizeof (int ));
2836
2862
qsort (mem ,n_failed_imgs ,sizeof (int ),cmpfunc );
2837
2863
array -> dtype = 265 ;
2838
- array -> dim [0 ].lower_bound = 0 ;
2864
+ array -> dim [0 ].lower_bound = 1 ;
2839
2865
array -> dim [0 ]._ubound = n_failed_imgs - 1 ;
2840
2866
array -> dim [0 ]._stride = 1 ;
2841
2867
array -> offset = -1 ;
0 commit comments