@@ -278,6 +278,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat,
278
278
279
279
if (error_called == 1 )
280
280
{
281
+ MPIX_Comm_revoke (CAF_COMM_WORLD );
281
282
communicator_shrink (& CAF_COMM_WORLD );
282
283
error_called = 0 ;
283
284
}
@@ -308,6 +309,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat,
308
309
309
310
if (error_called == 1 )
310
311
{
312
+ MPIX_Comm_revoke (CAF_COMM_WORLD );
311
313
communicator_shrink (& CAF_COMM_WORLD );
312
314
error_called = 0 ;
313
315
ierr = STAT_FAILED_IMAGE ;
@@ -369,6 +371,7 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat,
369
371
370
372
if (error_called == 1 )
371
373
{
374
+ MPIX_Comm_revoke (CAF_COMM_WORLD );
372
375
communicator_shrink (& CAF_COMM_WORLD );
373
376
error_called = 0 ;
374
377
ierr = STAT_FAILED_IMAGE ;
@@ -592,9 +595,8 @@ int communicator_shrink(MPI_Comm *comm)
592
595
MPI_Comm shrunk , * newcomm ;
593
596
MPI_Group cgrp , sgrp , dgrp ;
594
597
595
- redo :
596
598
newcomm = (MPI_Comm * )calloc (1 ,sizeof (MPI_Comm ));
597
-
599
+ redo :
598
600
MPIX_Comm_shrink (* comm , & shrunk );
599
601
MPI_Comm_set_errhandler ( shrunk , errh );
600
602
MPI_Comm_size (shrunk , & ns ); MPI_Comm_rank (shrunk , & srank );
@@ -606,7 +608,7 @@ int communicator_shrink(MPI_Comm *comm)
606
608
if (* img_status == STAT_STOPPED_IMAGE )
607
609
crank = -1 ;
608
610
rc = MPI_Comm_split (shrunk , crank < 0 ?MPI_UNDEFINED :1 , crank , newcomm );
609
-
611
+ flag = ( rc == MPI_SUCCESS );
610
612
/* Split or some of the communications above may have failed if
611
613
* new failures have disrupted the process: we need to
612
614
* make sure we succeeded at all ranks, or retry until it works. */
@@ -687,6 +689,12 @@ void *
687
689
688
690
MPI_Win_set_errhandler (* p ,errh_w );
689
691
692
+ if (error_called == 1 )
693
+ {
694
+ communicator_shrink (& CAF_COMM_WORLD );
695
+ error_called = 0 ;
696
+ }
697
+
690
698
if (l_var )
691
699
{
692
700
init_array = (int * )calloc (size , sizeof (int ));
@@ -704,17 +712,13 @@ void *
704
712
/* PREFIX(sync_all) (NULL,NULL,0); */
705
713
}
706
714
707
- /* MPI_Barrier(CAF_COMM_WORLD); */
708
-
709
- /* if(error_called == 1) */
710
- /* { */
711
- /* communicator_shrink(&CAF_COMM_WORLD); */
712
- /* error_called = 0; */
713
- /* ierr = STAT_FAILED_IMAGE; */
714
- /* } */
715
+ if (error_called == 1 )
716
+ {
717
+ MPIX_Comm_revoke (CAF_COMM_WORLD );
718
+ communicator_shrink (& CAF_COMM_WORLD );
719
+ error_called = 0 ;
720
+ }
715
721
716
- /* MPI_Win_create_dynamic(MPI_INFO_NULL, stopped_comm, stopped_win); */
717
-
718
722
caf_static_t * tmp = malloc (sizeof (caf_static_t ));
719
723
tmp -> prev = caf_tot ;
720
724
tmp -> token = * token ;
@@ -794,6 +798,13 @@ PREFIX (deregister) (caf_token_t *token, int *stat, char *errmsg, int errmsg_len
794
798
/* PREFIX (sync_all) (NULL, NULL, 0); */
795
799
MPI_Barrier (CAF_COMM_WORLD );
796
800
801
+ if (error_called == 1 )
802
+ {
803
+ communicator_shrink (& CAF_COMM_WORLD );
804
+ error_called = 0 ;
805
+ MPI_Barrier (CAF_COMM_WORLD );
806
+ }
807
+
797
808
caf_static_t * tmp = caf_tot , * prev = caf_tot , * next = caf_tot ;
798
809
MPI_Win * p = * token ;
799
810
@@ -828,10 +839,6 @@ PREFIX (deregister) (caf_token_t *token, int *stat, char *errmsg, int errmsg_len
828
839
if (stat )
829
840
* stat = 0 ;
830
841
831
- /* if (unlikely (ierr = ARMCI_Free ((*token)[caf_this_image-1]))) */
832
- /* caf_runtime_error ("ARMCI memory freeing failed: Error code %d", ierr); */
833
- //gasnet_exit(0);
834
-
835
842
free (* token );
836
843
}
837
844
@@ -848,13 +855,6 @@ void
848
855
PREFIX (sync_all ) (int * stat , char * errmsg , int errmsg_len )
849
856
{
850
857
int ierr = 0 ,flag = 0 ;
851
-
852
- /* if(error_called == 1) */
853
- /* { */
854
- /* communicator_shrink(&CAF_COMM_WORLD); */
855
- /* error_called = 0; */
856
- /* ierr = STAT_FAILED_IMAGE; */
857
- /* } */
858
858
859
859
if (unlikely (caf_is_finalized ))
860
860
ierr = STAT_STOPPED_IMAGE ;
@@ -871,6 +871,7 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len)
871
871
communicator_shrink (& CAF_COMM_WORLD );
872
872
error_called = 0 ;
873
873
ierr = STAT_FAILED_IMAGE ;
874
+ MPI_Barrier (CAF_COMM_WORLD );
874
875
}
875
876
876
877
if (stat )
@@ -944,6 +945,7 @@ void selectType(int size, MPI_Datatype *dt)
944
945
945
946
}
946
947
948
+ /* Not yet adapted for failed images */
947
949
void
948
950
PREFIX (sendget ) (caf_token_t token_s , size_t offset_s , int image_index_s ,
949
951
gfc_descriptor_t * dest ,
@@ -1326,65 +1328,29 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
1326
1328
MPI_Win_flush (image_index - 1 , * p );
1327
1329
# endif // CAF_MPI_LOCK_UNLOCK
1328
1330
1329
- if (ierr != 0 )
1330
- {
1331
- error_stop (ierr );
1332
- return ;
1333
- }
1334
-
1335
- MPI_Type_free (& dt_s );
1336
- MPI_Type_free (& dt_d );
1337
-
1338
- /* msg = 2; */
1339
- /* MPI_Pack(&msg, 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1340
- /* MPI_Pack(&rank, 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1341
-
1342
- /* for(j=0;j<rank;j++) */
1343
- /* { */
1344
- /* MPI_Pack(&(dest->dim[j]._stride), 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1345
- /* MPI_Pack(&(dest->dim[j].lower_bound), 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1346
- /* MPI_Pack(&(dest->dim[j]._ubound), 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1347
- /* } */
1348
-
1349
- /* MPI_Pack(&size, 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1350
-
1351
- /* /\* non-blocking send *\/ */
1352
-
1353
- /* MPI_Issend(buff_am[caf_this_image], position, MPI_PACKED, image_index-1, 1, CAF_COMM_WORLD, &reqdt); */
1354
-
1355
- /* msgbody = calloc(size, sizeof(char)); */
1331
+ MPI_Test (& lock_req ,& flag ,MPI_STATUS_IGNORE );
1332
+
1333
+ if (error_called == 1 )
1334
+ {
1335
+ communicator_shrink (& CAF_COMM_WORLD );
1336
+ error_called = 0 ;
1337
+ ierr = STAT_FAILED_IMAGE ;
1338
+ }
1339
+
1340
+ if (!stat && ierr == STAT_FAILED_IMAGE )
1341
+ error_stop (ierr );
1356
1342
1357
- /* ptrdiff_t array_offset_sr = 0; */
1358
- /* ptrdiff_t stride = 1; */
1359
- /* ptrdiff_t extent = 1; */
1343
+ if (stat )
1344
+ * stat = ierr ;
1360
1345
1361
- /* for(i = 0; i < size; i++ ) */
1346
+ /* if (ierr ! = 0) */
1362
1347
/* { */
1363
- /* for (j = 0; j < GFC_DESCRIPTOR_RANK (src)-1; j++) */
1364
- /* { */
1365
- /* array_offset_sr += ((i / (extent*stride)) */
1366
- /* % (src->dim[j]._ubound */
1367
- /* - src->dim[j].lower_bound + 1)) */
1368
- /* * src->dim[j]._stride; */
1369
- /* extent = (src->dim[j]._ubound - src->dim[j].lower_bound + 1); */
1370
- /* stride = src->dim[j]._stride; */
1371
- /* } */
1372
-
1373
- /* array_offset_sr += (i / extent) * src->dim[rank-1]._stride; */
1374
-
1375
- /* void *sr = (void *)((char *) src->base_addr */
1376
- /* + array_offset_sr*GFC_DESCRIPTOR_SIZE (src)); */
1377
-
1378
- /* memmove (msgbody+p_mb, sr, GFC_DESCRIPTOR_SIZE (src)); */
1379
-
1380
- /* p_mb += GFC_DESCRIPTOR_SIZE (src); */
1348
+ /* error_stop (ierr); */
1349
+ /* return; */
1381
1350
/* } */
1382
1351
1383
- /* MPI_Wait(&reqdt, &stadt); */
1384
-
1385
- /* MPI_Ssend(msgbody, size, MPI_BYTE, image_index-1, 1, CAF_COMM_WORLD); */
1386
-
1387
- /* free(msgbody); */
1352
+ MPI_Type_free (& dt_s );
1353
+ MPI_Type_free (& dt_d );
1388
1354
1389
1355
#else
1390
1356
if (caf_this_image == image_index && mrt )
@@ -1459,12 +1425,12 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
1459
1425
ierr = MPI_Put (pad_str , dst_size - src_size , MPI_BYTE , image_index - 1 ,
1460
1426
dst_offset , dst_size - src_size , MPI_BYTE , * p );
1461
1427
}
1462
-
1463
- if (ierr != 0 )
1464
- {
1465
- error_stop (ierr );
1466
- return ;
1467
- }
1428
+
1429
+ /* if (ierr != 0) */
1430
+ /* { */
1431
+ /* error_stop (ierr); */
1432
+ /* return; */
1433
+ /* } */
1468
1434
}
1469
1435
1470
1436
if (caf_this_image == image_index && mrt )
@@ -1503,6 +1469,20 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
1503
1469
MPI_Win_flush (image_index - 1 , * p );
1504
1470
# endif // CAF_MPI_LOCK_UNLOCK
1505
1471
#endif
1472
+ MPI_Test (& lock_req ,& flag ,MPI_STATUS_IGNORE );
1473
+
1474
+ if (error_called == 1 )
1475
+ {
1476
+ communicator_shrink (& CAF_COMM_WORLD );
1477
+ error_called = 0 ;
1478
+ ierr = STAT_FAILED_IMAGE ;
1479
+ }
1480
+
1481
+ if (!stat && ierr == STAT_FAILED_IMAGE )
1482
+ error_stop (ierr );
1483
+
1484
+ if (stat )
1485
+ * stat = ierr ;
1506
1486
}
1507
1487
}
1508
1488
@@ -1684,23 +1664,26 @@ PREFIX (get) (caf_token_t token, size_t offset,
1684
1664
1685
1665
//sr_off = offset;
1686
1666
1687
- MPI_Test (& lock_req ,& flag ,MPI_STATUS_IGNORE );
1667
+ # ifdef CAF_MPI_LOCK_UNLOCK
1668
+ MPI_Win_lock (MPI_LOCK_SHARED , image_index - 1 , 0 , * p );
1669
+ # endif // CAF_MPI_LOCK_UNLOCK
1688
1670
1671
+ ierr = MPI_Get (dst , 1 , dt_d , image_index - 1 , offset , 1 , dt_s , * p );
1672
+
1673
+ MPI_Test (& lock_req ,& flag ,MPI_STATUS_IGNORE );
1674
+
1689
1675
if (error_called == 1 )
1690
1676
{
1691
1677
communicator_shrink (& CAF_COMM_WORLD );
1692
1678
error_called = 0 ;
1693
1679
ierr = STAT_FAILED_IMAGE ;
1694
1680
}
1695
-
1681
+
1696
1682
if (!stat && ierr == STAT_FAILED_IMAGE )
1697
1683
error_stop (ierr );
1698
1684
1699
- # ifdef CAF_MPI_LOCK_UNLOCK
1700
- MPI_Win_lock (MPI_LOCK_SHARED , image_index - 1 , 0 , * p );
1701
- # endif // CAF_MPI_LOCK_UNLOCK
1702
-
1703
- ierr = MPI_Get (dst , 1 , dt_d , image_index - 1 , offset , 1 , dt_s , * p );
1685
+ if (stat )
1686
+ * stat = ierr ;
1704
1687
1705
1688
# ifdef CAF_MPI_LOCK_UNLOCK
1706
1689
MPI_Win_unlock (image_index - 1 , * p );
0 commit comments