Skip to content

Commit 59ce1e5

Browse files
author
Alessandro Fanfarillo
committed
General improvement after tutorial
1 parent cba6ef5 commit 59ce1e5

File tree

1 file changed

+74
-91
lines changed

1 file changed

+74
-91
lines changed

src/mpi/mpi_caf.c

Lines changed: 74 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat,
278278

279279
if(error_called == 1)
280280
{
281+
MPIX_Comm_revoke(CAF_COMM_WORLD);
281282
communicator_shrink(&CAF_COMM_WORLD);
282283
error_called = 0;
283284
}
@@ -308,6 +309,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat,
308309

309310
if(error_called == 1)
310311
{
312+
MPIX_Comm_revoke(CAF_COMM_WORLD);
311313
communicator_shrink(&CAF_COMM_WORLD);
312314
error_called = 0;
313315
ierr = STAT_FAILED_IMAGE;
@@ -369,6 +371,7 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat,
369371

370372
if(error_called == 1)
371373
{
374+
MPIX_Comm_revoke(CAF_COMM_WORLD);
372375
communicator_shrink(&CAF_COMM_WORLD);
373376
error_called = 0;
374377
ierr = STAT_FAILED_IMAGE;
@@ -592,9 +595,8 @@ int communicator_shrink(MPI_Comm *comm)
592595
MPI_Comm shrunk, *newcomm;
593596
MPI_Group cgrp, sgrp, dgrp;
594597

595-
redo:
596598
newcomm = (MPI_Comm *)calloc(1,sizeof(MPI_Comm));
597-
599+
redo:
598600
MPIX_Comm_shrink(*comm, &shrunk);
599601
MPI_Comm_set_errhandler( shrunk, errh );
600602
MPI_Comm_size(shrunk, &ns); MPI_Comm_rank(shrunk, &srank);
@@ -606,7 +608,7 @@ int communicator_shrink(MPI_Comm *comm)
606608
if (*img_status == STAT_STOPPED_IMAGE)
607609
crank = -1;
608610
rc = MPI_Comm_split(shrunk, crank<0?MPI_UNDEFINED:1, crank, newcomm);
609-
611+
flag = (rc == MPI_SUCCESS);
610612
/* Split or some of the communications above may have failed if
611613
* new failures have disrupted the process: we need to
612614
* make sure we succeeded at all ranks, or retry until it works. */
@@ -687,6 +689,12 @@ void *
687689

688690
MPI_Win_set_errhandler(*p,errh_w);
689691

692+
if(error_called == 1)
693+
{
694+
communicator_shrink(&CAF_COMM_WORLD);
695+
error_called = 0;
696+
}
697+
690698
if(l_var)
691699
{
692700
init_array = (int *)calloc(size, sizeof(int));
@@ -704,17 +712,13 @@ void *
704712
/* PREFIX(sync_all) (NULL,NULL,0); */
705713
}
706714

707-
/* MPI_Barrier(CAF_COMM_WORLD); */
708-
709-
/* if(error_called == 1) */
710-
/* { */
711-
/* communicator_shrink(&CAF_COMM_WORLD); */
712-
/* error_called = 0; */
713-
/* ierr = STAT_FAILED_IMAGE; */
714-
/* } */
715+
if(error_called == 1)
716+
{
717+
MPIX_Comm_revoke(CAF_COMM_WORLD);
718+
communicator_shrink(&CAF_COMM_WORLD);
719+
error_called = 0;
720+
}
715721

716-
/* MPI_Win_create_dynamic(MPI_INFO_NULL, stopped_comm, stopped_win); */
717-
718722
caf_static_t *tmp = malloc (sizeof (caf_static_t));
719723
tmp->prev = caf_tot;
720724
tmp->token = *token;
@@ -794,6 +798,13 @@ PREFIX (deregister) (caf_token_t *token, int *stat, char *errmsg, int errmsg_len
794798
/* PREFIX (sync_all) (NULL, NULL, 0); */
795799
MPI_Barrier(CAF_COMM_WORLD);
796800

801+
if(error_called == 1)
802+
{
803+
communicator_shrink(&CAF_COMM_WORLD);
804+
error_called = 0;
805+
MPI_Barrier(CAF_COMM_WORLD);
806+
}
807+
797808
caf_static_t *tmp = caf_tot, *prev = caf_tot, *next=caf_tot;
798809
MPI_Win *p = *token;
799810

@@ -828,10 +839,6 @@ PREFIX (deregister) (caf_token_t *token, int *stat, char *errmsg, int errmsg_len
828839
if (stat)
829840
*stat = 0;
830841

831-
/* if (unlikely (ierr = ARMCI_Free ((*token)[caf_this_image-1]))) */
832-
/* caf_runtime_error ("ARMCI memory freeing failed: Error code %d", ierr); */
833-
//gasnet_exit(0);
834-
835842
free (*token);
836843
}
837844

@@ -848,13 +855,6 @@ void
848855
PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len)
849856
{
850857
int ierr=0,flag=0;
851-
852-
/* if(error_called == 1) */
853-
/* { */
854-
/* communicator_shrink(&CAF_COMM_WORLD); */
855-
/* error_called = 0; */
856-
/* ierr = STAT_FAILED_IMAGE; */
857-
/* } */
858858

859859
if (unlikely (caf_is_finalized))
860860
ierr = STAT_STOPPED_IMAGE;
@@ -871,6 +871,7 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len)
871871
communicator_shrink(&CAF_COMM_WORLD);
872872
error_called = 0;
873873
ierr = STAT_FAILED_IMAGE;
874+
MPI_Barrier(CAF_COMM_WORLD);
874875
}
875876

876877
if (stat)
@@ -944,6 +945,7 @@ void selectType(int size, MPI_Datatype *dt)
944945

945946
}
946947

948+
/* Not yet adapted for failed images */
947949
void
948950
PREFIX (sendget) (caf_token_t token_s, size_t offset_s, int image_index_s,
949951
gfc_descriptor_t *dest,
@@ -1326,65 +1328,29 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
13261328
MPI_Win_flush (image_index-1, *p);
13271329
# endif // CAF_MPI_LOCK_UNLOCK
13281330

1329-
if (ierr != 0)
1330-
{
1331-
error_stop (ierr);
1332-
return;
1333-
}
1334-
1335-
MPI_Type_free (&dt_s);
1336-
MPI_Type_free (&dt_d);
1337-
1338-
/* msg = 2; */
1339-
/* MPI_Pack(&msg, 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1340-
/* MPI_Pack(&rank, 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1341-
1342-
/* for(j=0;j<rank;j++) */
1343-
/* { */
1344-
/* MPI_Pack(&(dest->dim[j]._stride), 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1345-
/* MPI_Pack(&(dest->dim[j].lower_bound), 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1346-
/* MPI_Pack(&(dest->dim[j]._ubound), 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1347-
/* } */
1348-
1349-
/* MPI_Pack(&size, 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */
1350-
1351-
/* /\* non-blocking send *\/ */
1352-
1353-
/* MPI_Issend(buff_am[caf_this_image], position, MPI_PACKED, image_index-1, 1, CAF_COMM_WORLD, &reqdt); */
1354-
1355-
/* msgbody = calloc(size, sizeof(char)); */
1331+
MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE);
1332+
1333+
if(error_called == 1)
1334+
{
1335+
communicator_shrink(&CAF_COMM_WORLD);
1336+
error_called = 0;
1337+
ierr = STAT_FAILED_IMAGE;
1338+
}
1339+
1340+
if(!stat && ierr == STAT_FAILED_IMAGE)
1341+
error_stop (ierr);
13561342

1357-
/* ptrdiff_t array_offset_sr = 0; */
1358-
/* ptrdiff_t stride = 1; */
1359-
/* ptrdiff_t extent = 1; */
1343+
if(stat)
1344+
*stat = ierr;
13601345

1361-
/* for(i = 0; i < size; i++) */
1346+
/* if (ierr != 0) */
13621347
/* { */
1363-
/* for (j = 0; j < GFC_DESCRIPTOR_RANK (src)-1; j++) */
1364-
/* { */
1365-
/* array_offset_sr += ((i / (extent*stride)) */
1366-
/* % (src->dim[j]._ubound */
1367-
/* - src->dim[j].lower_bound + 1)) */
1368-
/* * src->dim[j]._stride; */
1369-
/* extent = (src->dim[j]._ubound - src->dim[j].lower_bound + 1); */
1370-
/* stride = src->dim[j]._stride; */
1371-
/* } */
1372-
1373-
/* array_offset_sr += (i / extent) * src->dim[rank-1]._stride; */
1374-
1375-
/* void *sr = (void *)((char *) src->base_addr */
1376-
/* + array_offset_sr*GFC_DESCRIPTOR_SIZE (src)); */
1377-
1378-
/* memmove (msgbody+p_mb, sr, GFC_DESCRIPTOR_SIZE (src)); */
1379-
1380-
/* p_mb += GFC_DESCRIPTOR_SIZE (src); */
1348+
/* error_stop (ierr); */
1349+
/* return; */
13811350
/* } */
13821351

1383-
/* MPI_Wait(&reqdt, &stadt); */
1384-
1385-
/* MPI_Ssend(msgbody, size, MPI_BYTE, image_index-1, 1, CAF_COMM_WORLD); */
1386-
1387-
/* free(msgbody); */
1352+
MPI_Type_free (&dt_s);
1353+
MPI_Type_free (&dt_d);
13881354

13891355
#else
13901356
if(caf_this_image == image_index && mrt)
@@ -1459,12 +1425,12 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
14591425
ierr = MPI_Put (pad_str, dst_size - src_size, MPI_BYTE, image_index-1,
14601426
dst_offset, dst_size - src_size, MPI_BYTE, *p);
14611427
}
1462-
1463-
if (ierr != 0)
1464-
{
1465-
error_stop (ierr);
1466-
return;
1467-
}
1428+
1429+
/* if (ierr != 0) */
1430+
/* { */
1431+
/* error_stop (ierr); */
1432+
/* return; */
1433+
/* } */
14681434
}
14691435

14701436
if(caf_this_image == image_index && mrt)
@@ -1503,6 +1469,20 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index,
15031469
MPI_Win_flush (image_index-1, *p);
15041470
# endif // CAF_MPI_LOCK_UNLOCK
15051471
#endif
1472+
MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE);
1473+
1474+
if(error_called == 1)
1475+
{
1476+
communicator_shrink(&CAF_COMM_WORLD);
1477+
error_called = 0;
1478+
ierr = STAT_FAILED_IMAGE;
1479+
}
1480+
1481+
if(!stat && ierr == STAT_FAILED_IMAGE)
1482+
error_stop (ierr);
1483+
1484+
if(stat)
1485+
*stat = ierr;
15061486
}
15071487
}
15081488

@@ -1684,23 +1664,26 @@ PREFIX (get) (caf_token_t token, size_t offset,
16841664

16851665
//sr_off = offset;
16861666

1687-
MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE);
1667+
# ifdef CAF_MPI_LOCK_UNLOCK
1668+
MPI_Win_lock (MPI_LOCK_SHARED, image_index-1, 0, *p);
1669+
# endif // CAF_MPI_LOCK_UNLOCK
16881670

1671+
ierr = MPI_Get (dst, 1, dt_d, image_index-1, offset, 1, dt_s, *p);
1672+
1673+
MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE);
1674+
16891675
if(error_called == 1)
16901676
{
16911677
communicator_shrink(&CAF_COMM_WORLD);
16921678
error_called = 0;
16931679
ierr = STAT_FAILED_IMAGE;
16941680
}
1695-
1681+
16961682
if(!stat && ierr == STAT_FAILED_IMAGE)
16971683
error_stop (ierr);
16981684

1699-
# ifdef CAF_MPI_LOCK_UNLOCK
1700-
MPI_Win_lock (MPI_LOCK_SHARED, image_index-1, 0, *p);
1701-
# endif // CAF_MPI_LOCK_UNLOCK
1702-
1703-
ierr = MPI_Get (dst, 1, dt_d, image_index-1, offset, 1, dt_s, *p);
1685+
if(stat)
1686+
*stat = ierr;
17041687

17051688
# ifdef CAF_MPI_LOCK_UNLOCK
17061689
MPI_Win_unlock (image_index-1, *p);

0 commit comments

Comments
 (0)