From 32645445be0ca39f00cd62d581d0dd991eb4f8da Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Thu, 16 Oct 2025 07:37:44 -0700 Subject: [PATCH 1/3] More reliable MCA parameters in CI tests --- .github/docker-compose.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/docker-compose.yml b/.github/docker-compose.yml index 2212b0c..5291752 100644 --- a/.github/docker-compose.yml +++ b/.github/docker-compose.yml @@ -41,6 +41,9 @@ services: -DBUILD_TESTING=ON \ -DMPIEXEC_PREFLAGS="--allow-run-as-root;--map-by;:oversubscribe" && \ make -j + + ENV OMPI_MCA_coll=^han + ENV OMPI_MCA_btl=tcp,sm,self WORKDIR /fenix/build ENTRYPOINT ["/entrypoint.sh"] From 571b5440c224c668472d0107ff5b10f76afafdee Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Fri, 25 Apr 2025 10:08:15 -0500 Subject: [PATCH 2/3] Support c++ lambda callbacks --- include/fenix.h | 2 + include/fenix.hpp | 74 ++++++++++++++++++++++++++++++ include/fenix_ext.hpp | 5 +- include/fenix_process_recovery.hpp | 19 ++------ src/CMakeLists.txt | 2 + src/fenix.cpp | 10 +++- src/fenix_callbacks.cpp | 64 ++++---------------------- src/fenix_process_recovery.cpp | 12 ++--- 8 files changed, 106 insertions(+), 82 deletions(-) create mode 100644 include/fenix.hpp diff --git a/include/fenix.h b/include/fenix.h index 28af25b..c812ca4 100644 --- a/include/fenix.h +++ b/include/fenix.h @@ -61,6 +61,8 @@ #include #if defined(c_plusplus) || defined(__cplusplus) +#include "fenix.hpp" + extern "C" { #endif diff --git a/include/fenix.hpp b/include/fenix.hpp new file mode 100644 index 0000000..ff69498 --- /dev/null +++ b/include/fenix.hpp @@ -0,0 +1,74 @@ +/* +//@HEADER +// ************************************************************************ +// +// +// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _| +// _| _| _|_| _| _| _| _| +// _|_|_| _|_|_| _| _| _| _| _| +// _| _| _| _|_| _| _| _| +// _| _|_|_|_| _| _| _|_|_| _| _| +// +// +// +// +// Copyright (C) 2016 Rutgers University and Sandia Corporation +// +// Under the terms 
of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, +// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock +// +// Questions? 
Contact Keita Teranishi (knteran@sandia.gov) and +// Marc Gamell (mgamell@cac.rutgers.edu) +// +// ************************************************************************ +//@HEADER +*/ + + +#ifndef __FENIX_HPP__ +#define __FENIX_HPP__ + +#include +#include +#include "fenix.h" + +/** + * @brief As the C-style callback, but accepts an std::function and does not use the void* pointer. + * + * @param[in] callback The function to register. + * + * @returnstatus + */ +int Fenix_Callback_register(std::function callback); + +#endif diff --git a/include/fenix_ext.hpp b/include/fenix_ext.hpp index 9930798..0026325 100644 --- a/include/fenix_ext.hpp +++ b/include/fenix_ext.hpp @@ -58,6 +58,7 @@ #define __FENIX_EXT_H__ #include +#include #include "fenix.h" #include "fenix_opt.hpp" #include "fenix_data_group.hpp" @@ -77,7 +78,7 @@ typedef struct { //enum FenixRankRole role; // Role of rank: initial, survivor or repair int role; // Role of rank: initial, survivor or repair - int fenix_init_flag; + int fenix_init_flag = 0; int fail_world_size; int* fail_world; @@ -86,7 +87,7 @@ typedef struct { int *ret_role; int *ret_error; - fenix_callback_list_t* callback_list; // singly linked list for user-defined Fenix callback functions + std::vector callbacks; fenix_debug_opt_t options; // This is reserved to store the user options MPI_Comm *world; // Duplicate of the MPI communicator provided by user diff --git a/include/fenix_process_recovery.hpp b/include/fenix_process_recovery.hpp index 1132992..f6ad346 100644 --- a/include/fenix_process_recovery.hpp +++ b/include/fenix_process_recovery.hpp @@ -67,21 +67,12 @@ #include #include "fenix_init.h" +#include #define __FENIX_RESUME_AT_INIT 0 #define __FENIX_RESUME_NO_JUMP 200 -typedef void (*recover)( MPI_Comm, int, void *); - -typedef struct fcouple { - recover x; - void *y; -} fenix_callback_func; - -typedef struct __fenix_callback_list { - fenix_callback_func *callback; - struct __fenix_callback_list *next; -} fenix_callback_list_t; 
+using fenix_callback_func = std::function; typedef struct __fenix_comm_list_elm { struct __fenix_comm_list_elm *next; @@ -98,16 +89,12 @@ int __fenix_create_new_world(); int __fenix_repair_ranks(); -int __fenix_callback_register(void (*recover)(MPI_Comm, int, void *), void *); +int __fenix_callback_register(fenix_callback_func& recover); int __fenix_callback_pop(); -void __fenix_callback_push(fenix_callback_list_t **, fenix_callback_func *); - void __fenix_callback_invoke_all(int error); -int __fenix_callback_destroy(fenix_callback_list_t *callback_list); - int* __fenix_get_fail_ranks(int *, int, int); int __fenix_spare_rank(); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4764c7b..0256344 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,6 +31,8 @@ globals.cpp add_library( fenix STATIC ${Fenix_SOURCES}) +target_compile_features(fenix PRIVATE cxx_std_17) + target_link_libraries(fenix PUBLIC MPI::MPI_CXX) target_include_directories(fenix diff --git a/src/fenix.cpp b/src/fenix.cpp index 525f57d..a383ad6 100644 --- a/src/fenix.cpp +++ b/src/fenix.cpp @@ -58,13 +58,19 @@ #include "fenix_process_recovery.hpp" #include "fenix_util.hpp" #include "fenix_ext.hpp" -#include "fenix.h" +#include "fenix.hpp" const Fenix_Data_subset FENIX_DATA_SUBSET_FULL = {0, NULL, NULL, NULL, 0, __FENIX_SUBSET_FULL}; const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY = {0, NULL, NULL, NULL, 0, __FENIX_SUBSET_EMPTY}; +int Fenix_Callback_register(std::function callback){ + return __fenix_callback_register(callback); +} + int Fenix_Callback_register(void (*recover)(MPI_Comm, int, void *), void *callback_data) { - return __fenix_callback_register(recover, callback_data); + return Fenix_Callback_register([recover, callback_data](MPI_Comm comm, int fenix_error){ + recover(comm, fenix_error, callback_data); + }); } int Fenix_Callback_pop() { diff --git a/src/fenix_callbacks.cpp b/src/fenix_callbacks.cpp index 400535e..5f981ba 100644 --- a/src/fenix_callbacks.cpp +++ 
b/src/fenix_callbacks.cpp @@ -65,71 +65,27 @@ #include -int __fenix_callback_register(void (*recover)(MPI_Comm, int, void *), void *callback_data) +int __fenix_callback_register(fenix_callback_func& recover) { - int error_code = FENIX_SUCCESS; - if (fenix.fenix_init_flag) { - fenix_callback_func *fp = (fenix_callback_func *) s_malloc(sizeof(fenix_callback_func)); - fp->x = recover; - fp->y = callback_data; - __fenix_callback_push( &fenix.callback_list, fp); - } else { - error_code = FENIX_ERROR_UNINITIALIZED; - } - return error_code; + if(!fenix.fenix_init_flag) return FENIX_ERROR_UNINITIALIZED; + + fenix.callbacks.push_back(recover); + + return FENIX_SUCCESS; } int __fenix_callback_pop(){ if(!fenix.fenix_init_flag) return FENIX_ERROR_UNINITIALIZED; - if(fenix.callback_list == NULL) return FENIX_ERROR_CALLBACK_NOT_REGISTERED; - - fenix_callback_list_t* old_head = fenix.callback_list; - fenix.callback_list = old_head->next; + if(fenix.callbacks.empty()) return FENIX_ERROR_CALLBACK_NOT_REGISTERED; - free(old_head->callback); - free(old_head); + fenix.callbacks.pop_back(); return FENIX_SUCCESS; } void __fenix_callback_invoke_all(int error) { - fenix_callback_list_t *current = fenix.callback_list; - while (current != NULL) { - (current->callback->x)((MPI_Comm) fenix.new_world, error, - (void *) current->callback->y); - current = current->next; - } -} - -void __fenix_callback_push(fenix_callback_list_t **head, fenix_callback_func *fp) -{ - fenix_callback_list_t *callback = (fenix_callback_list_t *) malloc(sizeof(fenix_callback_list_t)); - callback->callback = fp; - callback->next = *head; - *head = callback; -} - -int __fenix_callback_destroy(fenix_callback_list_t *callback_list) -{ - int error_code = FENIX_SUCCESS; - - if ( fenix.fenix_init_flag ) { - - fenix_callback_list_t *current = callback_list; - - while (current != NULL) { - fenix_callback_list_t *old; - old = current; - current = current->next; - free( old->callback ); - free( old ); - } - - } else { - 
error_code = FENIX_ERROR_UNINITIALIZED; + for(auto it = fenix.callbacks.rbegin(); it != fenix.callbacks.rend(); it++){ + (*it)(*fenix.user_world, error); } - - return error_code; } - diff --git a/src/fenix_process_recovery.cpp b/src/fenix_process_recovery.cpp index 18d7fea..f785d15 100644 --- a/src/fenix_process_recovery.cpp +++ b/src/fenix_process_recovery.cpp @@ -785,13 +785,11 @@ void __fenix_finalize() free(fenix.fail_world); } - /* Free Callbacks */ - __fenix_callback_destroy( fenix.callback_list ); - /* Free data recovery interface */ __fenix_data_recovery_destroy( fenix.data_recovery ); - fenix.fenix_init_flag = 0; + /* Free up any C++ data structures, reset default variables */ + fenix = {}; } void __fenix_finalize_spare() @@ -823,13 +821,11 @@ void __fenix_finalize_spare() MPI_Comm_set_errhandler(*fenix.world, MPI_ERRORS_ARE_FATAL); MPI_Comm_free(fenix.world); - /* Free callbacks */ - __fenix_callback_destroy( fenix.callback_list ); - /* Free data recovery interface */ __fenix_data_recovery_destroy( fenix.data_recovery ); - fenix.fenix_init_flag = 0; + /* Free up any C++ data structures, reset default variables */ + fenix = {}; /* Future version do not close MPI. Jump to where Fenix_Finalize is called. 
*/ MPI_Finalize(); From b058b21a40ed32ed5e6b3eaa8178cf199c20e248 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Fri, 25 Apr 2025 10:12:20 -0500 Subject: [PATCH 3/3] Add Fenix::CommException --- include/fenix.hpp | 15 ++++ include/fenix_exception.hpp | 74 +++++++++++++++ src/CMakeLists.txt | 1 + src/fenix_exception.cpp | 14 +++ test/CMakeLists.txt | 1 + test/exception_throw/CMakeLists.txt | 15 ++++ test/exception_throw/fenix_exceptions.cpp | 104 ++++++++++++++++++++++ 7 files changed, 224 insertions(+) create mode 100644 include/fenix_exception.hpp create mode 100644 src/fenix_exception.cpp create mode 100644 test/exception_throw/CMakeLists.txt create mode 100644 test/exception_throw/fenix_exceptions.cpp diff --git a/include/fenix.hpp b/include/fenix.hpp index ff69498..7112c71 100644 --- a/include/fenix.hpp +++ b/include/fenix.hpp @@ -61,6 +61,7 @@ #include #include #include "fenix.h" +#include "fenix_exception.hpp" /** * @brief As the C-style callback, but accepts an std::function and does not use the void* pointer. @@ -71,4 +72,18 @@ */ int Fenix_Callback_register(std::function callback); +namespace Fenix { + +/** + * @brief Registers a callback that throws a CommException + * + * This means no longjmp will occur, and instead applications + * will continue from their try-catch error handler. 
+ * + * @returnstatus + */ +int register_exception_callback(); + +} // namespace Fenix + #endif diff --git a/include/fenix_exception.hpp b/include/fenix_exception.hpp new file mode 100644 index 0000000..c2fc081 --- /dev/null +++ b/include/fenix_exception.hpp @@ -0,0 +1,74 @@ +/* +//@HEADER +// ************************************************************************ +// +// +// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _| +// _| _| _|_| _| _| _| _| +// _|_|_| _|_|_| _| _| _| _| _| +// _| _| _| _|_| _| _| _| +// _| _|_|_|_| _| _| _|_|_| _| _| +// +// +// +// +// Copyright (C) 2016 Rutgers University and Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, +// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock +// +// Questions? Contact Keita Teranishi (knteran@sandia.gov) and +// Marc Gamell (mgamell@cac.rutgers.edu) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef FENIX_EXCEPTION_HPP +#define FENIX_EXCEPTION_HPP + +#include +#include + +namespace Fenix { + +struct CommException : public std::exception { + MPI_Comm repaired_comm; + const int fenix_err; + CommException(MPI_Comm comm, int err) : + repaired_comm(comm), fenix_err(err) { }; +}; + +} // namespace Fenix + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0256344..5a8b7b0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,6 +16,7 @@ FILE(GLOB Fenix_HEADERS ${CMAKE_SOURCE_DIR}/include/*.h*) set (Fenix_SOURCES fenix.cpp +fenix_exception.cpp fenix_opt.cpp fenix_process_recovery.cpp fenix_util.cpp diff --git a/src/fenix_exception.cpp b/src/fenix_exception.cpp new file mode 100644 index 0000000..6208243 --- /dev/null +++ b/src/fenix_exception.cpp @@ -0,0 +1,14 @@ +#include "fenix_exception.hpp" +#include "fenix.h" + +namespace Fenix { + +int register_exception_callback(){ + return Fenix_Callback_register( + [](MPI_Comm repaired_comm, int fen_err){ + throw CommException(repaired_comm, fen_err); + } + ); +} + +} // namespace Fenix diff --git 
a/test/CMakeLists.txt b/test/CMakeLists.txt index c4f2e92..ba6f65c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(request_cancelled) add_subdirectory(no_jump) add_subdirectory(issend) add_subdirectory(failed_spares) +add_subdirectory(exception_throw) diff --git a/test/exception_throw/CMakeLists.txt b/test/exception_throw/CMakeLists.txt new file mode 100644 index 0000000..7cd5a58 --- /dev/null +++ b/test/exception_throw/CMakeLists.txt @@ -0,0 +1,15 @@ +# +# This file is part of Fenix +# Copyright (c) 2016 Rutgers University and Sandia Corporation. +# This software is distributed under the BSD License. +# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +# the U.S. Government retains certain rights in this software. +# For more information, see the LICENSE file in the top Fenix +# directory. +# + +add_executable(fenix_exceptions fenix_exceptions.cpp) +target_link_libraries(fenix_exceptions fenix MPI::MPI_CXX) + +add_test(NAME exception_throw + COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 6 ${MPIEXEC_PREFLAGS} fenix_exceptions ${MPIEXEC_POSTFLAGS}) diff --git a/test/exception_throw/fenix_exceptions.cpp b/test/exception_throw/fenix_exceptions.cpp new file mode 100644 index 0000000..92fc9a0 --- /dev/null +++ b/test/exception_throw/fenix_exceptions.cpp @@ -0,0 +1,104 @@ +/* +//@HEADER +// ************************************************************************ +// +// +// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _| +// _| _| _|_| _| _| _| _| +// _|_|_| _|_|_| _| _| _| _| _| +// _| _| _| _|_| _| _| _| +// _| _|_|_|_| _| _| _|_|_| _| _| +// +// +// +// +// Copyright (C) 2016 Rutgers University and Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, +// Michael Heroux, and Matthew Whitlock +// +// Questions? 
Contact Keita Teranishi (knteran@sandia.gov) and
+// Marc Gamell (mgamell@cac.rutgers.edu)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+int main(int argc, char **argv) { // test driver: exits 0 only when the fault surfaces as a Fenix::CommException
+  volatile int status = 0;
+
+  MPI_Init(&argc, &argv);
+
+  int fenix_role, error;
+  MPI_Comm res_comm;
+  MPI_Info info;
+  MPI_Info_create(&info);
+  MPI_Info_set(info, "FENIX_RESUME_MODE", "NO_JUMP");
+  MPI_Info_set(info, "FENIX_UNHANDLED_MODE", "NO_JUMP");
+  Fenix_Init(&fenix_role, MPI_COMM_WORLD, &res_comm, &argc, &argv, 0, 0, info, &error);
+
+  Fenix::register_exception_callback(); // registered callback throws Fenix::CommException when invoked
+
+  if(fenix_role == FENIX_ROLE_SURVIVOR_RANK){ // reaching this point as a survivor is reported as a failure
+    printf("FAILURE: longjmp instead of exception\n");
+    status = 1;
+  }
+
+  if (fenix_role == FENIX_ROLE_INITIAL_RANK) {
+    int rank;
+    MPI_Comm_rank(res_comm, &rank);
+    if(rank == 1) raise(SIGKILL); // rank 1 kills itself to inject the process failure
+
+    try {
+      MPI_Barrier(res_comm);
+      printf("FAILURE: barrier finished without fault\n");
+      status = 1;
+    } catch (const Fenix::CommException&){ // catch by const reference: no copy of the exception, no slicing
+      printf("SUCCESS: caught CommException\n");
+    }
+  }
+
+  Fenix_Finalize();
+  MPI_Finalize();
+
+  return status;
+}