Skip to content

Commit 120210a

Browse files
committed
Do not shrink communicator when number of failed images is out of bounds.
The number of failed images may be reported incorrectly during the finalization stage. To guard against this, the MPI error handler now checks that the reported number does not exceed the number of images and, if it does, simply returns with MPI_SUCCESS instead of shrinking the communicator. This commit should fix #390.
1 parent 9674d71 commit 120210a

File tree

1 file changed

+19
-2
lines changed

1 file changed

+19
-2
lines changed

src/mpi/mpi_caf.c

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,11 @@ failed_stopped_errorhandler_function (MPI_Comm* pcomm, int* perr, ...)
394394
*perr = MPI_SUCCESS;
395395
return;
396396
}
397+
if (num_failed_in_group > caf_num_images)
398+
{
399+
*perr = MPI_SUCCESS;
400+
return;
401+
}
397402

398403
MPI_Comm_group (comm, &comm_world_group);
399404
ranks_of_failed_in_comm_world = (int *) alloca (sizeof (int)
@@ -451,8 +456,20 @@ failed_stopped_errorhandler_function (MPI_Comm* pcomm, int* perr, ...)
451456
/* TODO: Consider whether removing the failed image from images_full will be
452457
* necessary. This is more or less politics. */
453458
for (i = 0; i < num_failed_in_group; ++i)
454-
if (image_stati[ranks_of_failed_in_comm_world[i]] == 0)
455-
image_stati[ranks_of_failed_in_comm_world[i]] = STAT_FAILED_IMAGE;
459+
{
460+
if (ranks_of_failed_in_comm_world[i] >= 0
461+
&& ranks_of_failed_in_comm_world[i] < caf_num_images)
462+
{
463+
if (image_stati[ranks_of_failed_in_comm_world[i]] == 0)
464+
image_stati[ranks_of_failed_in_comm_world[i]] = STAT_FAILED_IMAGE;
465+
}
466+
else
467+
{
468+
dprint ("%d/%d: Rank of failed image %d out of range of images 0..%d.\n",
469+
caf_this_image, caf_num_images, ranks_of_failed_in_comm_world[i],
470+
caf_num_images);
471+
}
472+
}
456473

457474
redo:
458475
dprint ("%d/%d: %s: Before shrink. \n", caf_this_image, caf_num_images, __FUNCTION__);

0 commit comments

Comments
 (0)