Skip to content

Commit 5915924

Browse files
authored
Merge pull request #370 from sourceryinstitute/vehre/failed-images
Pull in failed images support. - Fixes #309 - Fixes #354 - Fixes #388 - Fixes #390
2 parents 15de40c + b8e9f56 commit 5915924

16 files changed

+1353
-301
lines changed

CMakeLists.txt

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ set_property ( CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYP
88

99
# Add option and check environment to determine if developer tests should be run
1010
if($ENV{OPENCOARRAYS_DEVELOPER})
11-
option(RUN_DEVELOPER_TESTS "Run tests intended only for developers" ON)
11+
option(CAF_RUN_DEVELOPER_TESTS "Run tests intended only for developers" ON)
1212
else()
13-
option(RUN_DEVELOPER_TESTS "Run tests intended only for developers" OFF)
13+
option(CAF_RUN_DEVELOPER_TESTS "Run tests intended only for developers" OFF)
1414
endif()
15-
mark_as_advanced(RUN_DEVELOPER_TESTS)
15+
mark_as_advanced(CAF_RUN_DEVELOPER_TESTS)
1616

1717
if( NOT DEFINED ENV{OPENCOARRAYS_DEVELOPER})
1818
set ( ENV{OPENCOARRAYS_DEVELOPER} FALSE )
@@ -387,7 +387,7 @@ include(GNUInstallDirs)
387387
#-------------------------------
388388
# Recurse into the src directory
389389
#-------------------------------
390-
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
390+
include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
391391

392392
add_subdirectory(src)
393393

@@ -465,6 +465,24 @@ function(add_mpi_test name num_mpi_proc path)
465465
set_property(TEST ${name} PROPERTY PASS_REGULAR_EXPRESSION "Test passed.")
466466
endfunction(add_mpi_test)
467467

468+
# Register a CTest test that launches an MPI executable used by the
# failed-images test suite.
#
# Arguments:
#   name          - name of the CTest test to create
#   num_mpi_proc  - number of MPI ranks requested for the run
#   path          - executable handed to the MPI launcher
#
# Reads N, openmpi, MPIEXEC and MPIEXEC_NUMPROC_FLAG from the calling scope.
# NOTE(review): N is presumably the detected processor count (see the status
# message below) -- confirm against where it is set.
#
# When more ranks are requested than N provides (or N is 0) the test is
# reported as oversubscribed; under Open MPI the rank count is then reduced
# to N, or forced to 2 with --oversubscribe when N is below 2.
# The launcher is always given -disable-auto-cleanup, and the test passes only
# if its output matches "Test passed.".
function(add_fault_tolerant_mpi_test name num_mpi_proc path)
  if((N EQUAL 0) OR (N LESS num_mpi_proc))
    message(STATUS "Test ${name} is oversubscribed: ${num_mpi_proc} ranks requested with ${N} system processor available.")
    if(openmpi)
      if(N LESS 2)
        # Fewer than two processors: still run two ranks, but tell the
        # launcher that oversubscription is intended.
        set(test_parameters --oversubscribe)
        set(num_mpi_proc 2)
      else()
        set(num_mpi_proc ${N})
      endif()
      message(STATUS "Open-MPI detected, over-riding oversubscribed test, ${name}, with ${num_mpi_proc} ranks.")
    endif()
  endif()

  # Launcher arguments: optional --oversubscribe, the rank count, then the
  # flag that distinguishes this helper from a plain MPI test.
  list(APPEND test_parameters ${MPIEXEC_NUMPROC_FLAG} ${num_mpi_proc} -disable-auto-cleanup)

  add_test(NAME ${name} COMMAND ${MPIEXEC} ${test_parameters} "${path}")
  set_tests_properties(${name} PROPERTIES PASS_REGULAR_EXPRESSION "Test passed.")
endfunction()
485+
468486
set(tests_root ${CMAKE_CURRENT_BINARY_DIR}/src/tests)
469487

470488

@@ -480,7 +498,7 @@ if(opencoarrays_aware_compiler)
480498
add_mpi_test(register_alloc_comp_1 2 ${tests_root}/unit/init_register/register_alloc_comp_1)
481499
add_mpi_test(register_alloc_comp_2 2 ${tests_root}/unit/init_register/register_alloc_comp_2)
482500
add_mpi_test(register_alloc_comp_3 2 ${tests_root}/unit/init_register/register_alloc_comp_3)
483-
if (RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER})
501+
if (CAF_RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER})
484502
message ( STATUS "Running Developer tests is enabled." )
485503
add_mpi_test(async_comp_alloc 6 ${tests_root}/unit/init_register/async_comp_alloc)
486504
# Timeout async_comp_alloc test after 3 seconds to progress past the known failure
@@ -523,16 +541,34 @@ if(opencoarrays_aware_compiler)
523541
# GFortran PR 78505 only fixed on trunk/gcc 7
524542
add_mpi_test(source-alloc-no-sync 8 ${tests_root}/regression/reported/source-alloc-sync)
525543
endif()
526-
if (RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER})
544+
if (CAF_RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER})
527545
add_mpi_test(convert-before-put 3 ${tests_root}/regression/reported/convert-before-put)
528546
endif()
529547
add_mpi_test(event-post 3 ${tests_root}/regression/reported/event-post)
530548
add_mpi_test(co_reduce-factorial 4 ${tests_root}/regression/reported/co_reduce-factorial)
531549
add_mpi_test(co_reduce-factorial-int8 4 ${tests_root}/regression/reported/co_reduce-factorial-int8)
532550
add_mpi_test(co_reduce-factorial-int64 4 ${tests_root}/regression/reported/co_reduce-factorial-int64)
533551
add_mpi_test(co_reduce_string 4 ${tests_root}/unit/collectives/co_reduce_string)
534-
# remove this before merging into master
535-
# set_property(TEST co_reduce-factorial PROPERTY WILL_FAIL TRUE)
552+
553+
# IMAGE FAIL tests
# Registered only when the Fortran compiler version is 7 or newer; everything
# except image_status_test_1 additionally requires CAF_ENABLE_FAILED_IMAGES.
if(NOT CMAKE_Fortran_COMPILER_VERSION VERSION_LESS 7)
  add_mpi_test(image_status_test_1 4 ${tests_root}/unit/fail_images/image_status_test_1)

  if(CAF_ENABLE_FAILED_IMAGES)
    # No other way to check that image_fail_test_1 passes.
    add_fault_tolerant_mpi_test(image_fail_test_1 4 ${tests_root}/unit/fail_images/image_fail_test_1)
    set_tests_properties(image_fail_test_1 PROPERTIES
      FAIL_REGULAR_EXPRESSION "Test failed"
      PASS_REGULAR_EXPRESSION "Test passed"
    )

    add_fault_tolerant_mpi_test(image_fail_and_sync_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_1)

    # image_fail_and_sync_test_2 is registered for developer runs only.
    if(CAF_RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER})
      add_fault_tolerant_mpi_test(image_fail_and_sync_test_2 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_2)
    endif()

    # Remaining tests share the same rank count and directory layout; the
    # registration order matches the original sequence.
    foreach(fault_test
        image_fail_and_sync_test_3
        image_fail_and_status_test_1
        image_fail_and_failed_images_test_1
        image_fail_and_stopped_images_test_1
        image_fail_and_get_test_1)
      add_fault_tolerant_mpi_test(${fault_test} 4 ${tests_root}/unit/fail_images/${fault_test})
    endforeach()
  endif()
endif()
536572
else()
537573
add_test(co_sum_extension ${tests_root}/unit/extensions/test-co_sum-extension.sh)
538574
set_property(TEST co_sum_extension PROPERTY PASS_REGULAR_EXPRESSION "Test passed.")

src/libcaf.h

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
6363
#define STAT_LOCKED_OTHER_IMAGE 2
6464
#define STAT_DUP_SYNC_IMAGES 3
6565
#define STAT_STOPPED_IMAGE 6000
66+
#define STAT_FAILED_IMAGE 6001
6667

6768
/* Describes what type of array we are registering. Keep in sync with
6869
gcc/fortran/trans.h. */
@@ -88,11 +89,15 @@ typedef enum caf_deregister_t {
8889
caf_deregister_t;
8990

9091
typedef void* caf_token_t;
91-
92+
#ifdef GCC_GE_7
93+
/** Add a dummy type representing teams in coarrays. */
94+
typedef void * caf_team_t;
95+
#endif
9296

9397
/* Linked list of static coarrays registered. */
9498
typedef struct caf_static_t {
9599
caf_token_t token;
100+
caf_token_t stopped_token;
96101
struct caf_static_t *prev;
97102
}
98103
caf_static_t;
@@ -228,13 +233,15 @@ void PREFIX (deregister) (caf_token_t *, int *, char *, int);
228233
#endif
229234

230235
void PREFIX (caf_get) (caf_token_t, size_t, int, gfc_descriptor_t *,
231-
caf_vector_t *, gfc_descriptor_t *, int, int, int);
236+
caf_vector_t *, gfc_descriptor_t *, int, int, bool, int *);
232237
void PREFIX (caf_send) (caf_token_t, size_t, int, gfc_descriptor_t *,
233-
caf_vector_t *, gfc_descriptor_t *, int, int);
238+
caf_vector_t *, gfc_descriptor_t *, int, int, bool,
239+
int *);
234240

235241
void PREFIX (caf_sendget) (caf_token_t, size_t, int, gfc_descriptor_t *,
236242
caf_vector_t *, caf_token_t, size_t, int,
237-
gfc_descriptor_t *, caf_vector_t *, int, int);
243+
gfc_descriptor_t *, caf_vector_t *, int, int, bool,
244+
int *);
238245

239246
#ifdef GCC_GE_7
240247
void PREFIX(get_by_ref) (caf_token_t, int,
@@ -263,9 +270,16 @@ void PREFIX (sync_all) (int *, char *, int);
263270
void PREFIX (sync_images) (int, int[], int *, char *, int);
264271
void PREFIX (sync_memory) (int *, char *, int);
265272

273+
void PREFIX (stop_str) (const char *, int32_t) __attribute__ ((noreturn));
274+
void PREFIX (stop) (int32_t) __attribute__ ((noreturn));
266275
void PREFIX (error_stop_str) (const char *, int32_t)
267276
__attribute__ ((noreturn));
268277
void PREFIX (error_stop) (int32_t) __attribute__ ((noreturn));
278+
void PREFIX (fail_image) (void) __attribute__ ((noreturn));
279+
280+
int PREFIX (image_status) (int);
281+
void PREFIX (failed_images) (gfc_descriptor_t *, int, int *);
282+
void PREFIX (stopped_images) (gfc_descriptor_t *, int, int *);
269283

270284
void PREFIX (atomic_define) (caf_token_t, size_t, int, void *, int *, int, int);
271285
void PREFIX (atomic_ref) (caf_token_t, size_t, int, void *, int *, int, int);

src/mpi/CMakeLists.txt

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,67 @@ if(CAF_EXPOSE_INIT_FINALIZE)
2828
add_definitions(-DEXPOSE_INIT_FINALIZE)
2929
endif()
3030

31+
# Probe for <alloca.h>. When the header cannot be found, compile with
# -DALLOCA_MISSING and warn at configure time; the build continues.
include(CheckIncludeFile)
check_include_file("alloca.h" HAVE_ALLOCA)
if(NOT HAVE_ALLOCA)
  message(WARNING "Could not find <alloca.h>. Assuming functionality is provided elsewhere.")
  add_definitions(-DALLOCA_MISSING)
endif()
37+
38+
#----------------------------------------------------------------------
# Test if MPI implementation provides features needed for failed images
#----------------------------------------------------------------------
# Result: MPI_HAS_FAULT_TOL_EXT is YES only if every symbol listed in
# NEEDED_SYMBOLS is found in the MPI headers. As side effects,
# HAVE_MPI_EXT / HAVE_<symbol> are set by the check modules and
# -DHAVE_MPI_EXT_H is defined when <mpi-ext.h> is available.
#
# CMAKE_REQUIRED_INCLUDES / _FLAGS / _LIBRARIES are extended with the MPI C
# settings for the duration of the checks and restored afterwards.
include(CheckIncludeFile)
include(CheckSymbolExists)

set(NEEDED_SYMBOLS
  MPIX_ERR_PROC_FAILED
  MPIX_ERR_REVOKED
  MPIX_Comm_failure_ack
  MPIX_Comm_failure_get_acked
  MPIX_Comm_shrink
  MPIX_Comm_agree
)
set(MPI_HAS_FAULT_TOL_EXT YES)

# Include directories and libraries are ;-lists, so appending is sufficient.
set(old_cmake_required_includes "${CMAKE_REQUIRED_INCLUDES}")
list(APPEND CMAKE_REQUIRED_INCLUDES ${MPI_C_INCLUDE_PATH})

set(old_cmake_required_libraries "${CMAKE_REQUIRED_LIBRARIES}")
list(APPEND CMAKE_REQUIRED_LIBRARIES ${MPI_C_LIBRARIES})

# CMAKE_REQUIRED_FLAGS is a single space-separated string, NOT a ;-list:
# joining with ';' would corrupt the compiler command line of the checks.
# Any ';' that sneaks in through the MPI variables is normalised to a space.
set(old_cmake_required_flags "${CMAKE_REQUIRED_FLAGS}")
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${MPI_C_COMPILE_FLAGS} ${MPI_C_LINK_FLAGS}")
string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
string(STRIP "${CMAKE_REQUIRED_FLAGS}" CMAKE_REQUIRED_FLAGS)

# Headers searched for the symbols; <mpi-ext.h> is added when present.
set(MPI_HEADERS mpi.h)
check_include_file("mpi-ext.h" HAVE_MPI_EXT)
if(HAVE_MPI_EXT)
  add_definitions(-DHAVE_MPI_EXT_H)
  list(APPEND MPI_HEADERS mpi-ext.h)
endif()

foreach(symbol ${NEEDED_SYMBOLS})
  # The header list must be passed as ONE quoted argument. Unquoted, a
  # two-element list shifts the arguments so that "mpi-ext.h" is taken as the
  # result variable and HAVE_${symbol} is never set.
  check_symbol_exists(${symbol} "${MPI_HEADERS}" HAVE_${symbol})
  if(NOT HAVE_${symbol})
    message( STATUS "\${HAVE_${symbol}} = ${HAVE_${symbol}}")
    message( WARNING "Disabling Failed Image support due to lack of support in the current MPI implementation.")
    set(MPI_HAS_FAULT_TOL_EXT NO)
    break() # no need to keep looking
  endif()
endforeach()

# Put the check environment back the way it was found.
set(CMAKE_REQUIRED_INCLUDES ${old_cmake_required_includes})
set(CMAKE_REQUIRED_FLAGS ${old_cmake_required_flags})
set(CMAKE_REQUIRED_LIBRARIES ${old_cmake_required_libraries})
81+
82+
# Expose failed-images support as a user option when the MPI implementation
# passed the fault-tolerance probe (MPI_HAS_FAULT_TOL_EXT); otherwise force
# the cache entry to FALSE so it cannot be enabled.
if(NOT MPI_HAS_FAULT_TOL_EXT)
  set(CAF_ENABLE_FAILED_IMAGES FALSE CACHE BOOL "Enable failed images support" FORCE)
else()
  option(CAF_ENABLE_FAILED_IMAGES "Enable failed images support" TRUE)
endif()

# Compile with -DUSE_FAILED_IMAGES whenever the feature is enabled.
if(CAF_ENABLE_FAILED_IMAGES)
  add_definitions(-DUSE_FAILED_IMAGES)
endif()
91+
3192
# Determine whether and how to include OpenCoarrays module based on if the Fortran MPI compiler:
3293
# - works
3394
# - is compatible with the fortran compiler used to build the MPI implementation

0 commit comments

Comments
 (0)