Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
80abeeb
Initial conversion to C++
Matthew-Whitlock Mar 28, 2024
c105f57
Reorganize to allow c++ in internal headers
Matthew-Whitlock Mar 4, 2025
f22f3e0
Support c++ lambda callbacks
Matthew-Whitlock Apr 25, 2025
f7fcb39
Add Fenix::CommException
Matthew-Whitlock Apr 25, 2025
d51417e
Support data member inquiry functions
Matthew-Whitlock Apr 25, 2025
945ee68
Expand Fenix config options
Matthew-Whitlock Apr 25, 2025
8051e09
Update data subsets and policies to c++, implement storev and resizea…
Matthew-Whitlock Jun 4, 2025
f2a5274
Fix assertion for 1-rank test case
Matthew-Whitlock Jun 26, 2025
18c7fad
Use correct API in example
Matthew-Whitlock Jul 8, 2025
8d502b5
Rename some options for better encapsulation, add minor conveniences
Matthew-Whitlock Jul 8, 2025
b6b84f4
Expand c++ API parity, dedup
Matthew-Whitlock Jul 8, 2025
75ceff0
New basic convenience functions
Matthew-Whitlock Jul 8, 2025
9716e22
Small bugfix
Matthew-Whitlock Jul 8, 2025
d269266
Allow user to invoke callbacks, better define callback behavior when …
Matthew-Whitlock Jul 8, 2025
754192a
Unconditionally reorder processes
Matthew-Whitlock Jul 8, 2025
50f0a3f
Bugfix
Matthew-Whitlock Jul 15, 2025
d58f1d5
Fix issues with FENIX_ constants
Matthew-Whitlock Oct 16, 2025
1991c95
Small bugfixes
Matthew-Whitlock Oct 16, 2025
11886e5
Add pre-recovery callbacks
Matthew-Whitlock Oct 16, 2025
f22c88d
Revoke internal comms for IMR policy
Matthew-Whitlock Oct 16, 2025
5e2f1bc
Support building on mpich
Matthew-Whitlock Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ test/request_tracking/fenix_request_tracking_test
test/request_tracking/fenix_request_tracking_test_nofenix
build/
install/
spack-*

# Other
*~
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

cmake_minimum_required(VERSION 3.10.2)

project(Fenix C)
project(Fenix C CXX)
# The version number.
set(FENIX_VERSION_MAJOR 1)
set(FENIX_VERSION_MINOR 0)
Expand Down
1 change: 1 addition & 0 deletions examples/01_hello_world/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

add_executable(fenix_hello_world fenix_hello_world.c)
target_link_libraries(fenix_hello_world fenix ${MPI_C_LIBRARIES})
set_target_properties(fenix_hello_world PROPERTIES LINKER_LANGUAGE C)

if(BUILD_TESTING)
add_test(NAME hello_world
Expand Down
1 change: 1 addition & 0 deletions examples/01_hello_world/fenix/fenix_hello_world.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
#include <mpi.h>
#include <stdio.h>
#include <signal.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

Expand Down
7 changes: 4 additions & 3 deletions examples/02_send_recv/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@
#

add_executable(fenix_ring fenix_ring.c)
target_link_libraries(fenix_ring fenix ${MPI_C_LIBRARIES} m )
target_link_libraries(fenix_ring fenix ${MPI_C_LIBRARIES})
set_target_properties(fenix_ring PROPERTIES LINKER_LANGUAGE C)

if(BUILD_TESTING)
add_test(NAME ring
add_test(NAME send_recv
COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 5 ${MPIEXEC_PREFLAGS} fenix_ring ${MPIEXEC_POSTFLAGS} 1 2)
set_tests_properties(ring PROPERTIES
set_tests_properties(send_recv PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
36 changes: 22 additions & 14 deletions examples/05_subset_create/subset_create.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,12 @@
int max_iter = 2;
const int kCount = 100;
const int kKillID = 2;
const int my_group = 0;
const int my_member = 0;

int main(int argc, char **argv) {
fprintf(stderr, "Started\n");
int i;
int subset[500];
int subset[kCount];
MPI_Status status;

if (argc < 2) {
Expand All @@ -86,7 +87,6 @@ fprintf(stderr, "Started\n");
int num_ranks;
int rank;
int error;
int my_group = 0;
int my_timestamp = 0;
int my_depth = 1;
int recovered = 0;
Expand Down Expand Up @@ -120,24 +120,33 @@ fprintf(stderr, "Started\n");

if (fenix_role == FENIX_ROLE_INITIAL_RANK) {
// init my subset data
int index;
for (index = 0; index < kCount; index++) {
for (int index = 0; index < kCount; index++) {
subset[index] = -1;
}

Fenix_Data_member_create(my_group, 777, subset, kCount, MPI_INT);
Fenix_Data_member_create(my_group, my_member, subset, kCount, MPI_INT);

//Store the entire data set for the initial commit. This is not a requirement.
Fenix_Data_member_store(my_group, 777, FENIX_DATA_SUBSET_FULL);
Fenix_Data_member_store(my_group, my_member, FENIX_DATA_SUBSET_FULL);
Fenix_Data_commit_barrier(my_group, NULL);

} else {
//We've had a failure! Time to recover data.
fprintf(stderr, "Starting data recovery on node %d\n", rank);
Fenix_Data_member_restore(my_group, 777, subset, kCount, FENIX_TIME_STAMP_MAX, NULL);
fprintf(stderr, "Starting data recovery on rank %d\n", rank);

//Set all data to a value that was never stored
for (int index = 0; index < kCount; index++) {
subset[index] = -2;
}

int restore_ret = Fenix_Data_member_restore(my_group, my_member, subset, kCount, FENIX_DATA_SNAPSHOT_LATEST, NULL);

if(restore_ret != FENIX_SUCCESS){
fprintf(stderr, "Rank %d restore failure w/ code %d\n", rank, restore_ret);
}

int out_flag;
Fenix_Data_member_attr_set(my_group, 777, FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER,
Fenix_Data_member_attr_set(my_group, my_member, FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER,
subset, &out_flag);


Expand All @@ -159,20 +168,19 @@ fprintf(stderr, "Started\n");
//We'll store only the small subset that we specified, though.
//This means that as far as Fenix is concerned only data within that
//subset was ever changed from the initialized value of -1
Fenix_Data_member_store(my_group, 777, subset_specifier);
Fenix_Data_member_store(my_group, my_member, subset_specifier);
Fenix_Data_commit_barrier(my_group, NULL);

MPI_Barrier(new_comm); //Make sure everyone is done committing before we kill and restart everyone
//else we may end up with only some nodes having the commit, and it being unusable

}


//Kill a rank to test that we can recover from the commits we've made.
if (rank == kKillID && recovered == 0) {
fprintf(stderr, "Doing kill on node %d\n", rank);
pid_t pid = getpid();
kill(pid, SIGTERM);
kill(pid, SIGKILL);
}

//Make sure we've let rank 2 fail before proceeding, so we're definitely checking
Expand Down Expand Up @@ -214,6 +222,6 @@ fprintf(stderr, "Started\n");


Fenix_Finalize();
MPI_Finalize();
//MPI_Finalize();
return !successful; //return error status
}
21 changes: 21 additions & 0 deletions examples/07_resizeable_member/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#
# This file is part of Fenix
# Copyright (c) 2016 Rutgers University and Sandia Corporation.
# This software is distributed under the BSD License.
# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
# the U.S. Government retains certain rights in this software.
# For more information, see the LICENSE file in the top Fenix
# directory.
#

# Build the resizeable-member example from its single C++ source file.
add_executable(resizeable resizeable.cpp)
# Link against the fenix target and the MPI C libraries.
target_link_libraries(resizeable fenix ${MPI_C_LIBRARIES})

# resizeable.cpp uses designated initializers, so the target requires C++20.
target_compile_features(resizeable PRIVATE cxx_std_20)

# When testing is enabled, register the example as a test: it is launched
# through the MPI launcher on 5 processes with the single argument "1".
if(BUILD_TESTING)
add_test(NAME resizeable
COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 5 ${MPIEXEC_PREFLAGS} resizeable ${MPIEXEC_POSTFLAGS} 1)
# The test fails if its output contains "FAILURE"; it carries the "Example" label.
set_tests_properties(resizeable PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE" LABELS "Example")
endif()
203 changes: 203 additions & 0 deletions examples/07_resizeable_member/resizeable.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/*
//@HEADER
// ************************************************************************
//
//
// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _|
// _| _| _|_| _| _| _| _|
// _|_|_| _|_|_| _| _| _| _| _|
// _| _| _| _|_| _| _| _|
// _| _|_|_|_| _| _| _|_|_| _| _|
//
//
//
//
// Copyright (C) 2016 Rutgers University and Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY RUTGERS UNIVERSITY and SANDIA CORPORATION
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RUTGERS
// UNIVERSITY, SANDIA CORPORATION OR THE CONTRIBUTORS BE LIABLE FOR ANY
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
// IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
// IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
//
// ************************************************************************
//@HEADER
*/

#include <fenix.hpp>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/types.h>
#include <unistd.h>
#include <vector>

// Rank (within the resilient communicator) that raises SIGTERM on itself in
// main() after the second commit.
constexpr int kKillID = 2;
// Fenix data group ID passed to every Fenix data call in this example.
constexpr int my_group = 0;
// Fenix data member ID of the single member created inside my_group.
constexpr int my_member = 0;
// Passed as the third argument of Fenix_Data_group_create
// (presumably the group's initial timestamp -- confirm against the Fenix API).
constexpr int start_timestamp = 0;
// Passed as the fourth argument of Fenix_Data_group_create
// (presumably the group's snapshot depth -- confirm against the Fenix API).
constexpr int group_depth = 1;
// Receives the output flag of Fenix_Data_group_create and
// Fenix_Data_member_attr_set; main() never reads it back.
int errflag;

// DataSubset is used unqualified in main() for the restore calls.
using Fenix::DataSubset;
// NOTE(review): presumably this is what makes member_store, member_restore and
// member_lrestore callable unqualified in main() -- confirm against fenix.hpp.
using namespace Fenix::Data;

/**
 * Example / test driver for resizeable Fenix data members.
 *
 * Flow:
 *  - Ranks whose role is FENIX_ROLE_INITIAL_RANK create a data group and a
 *    FENIX_RESIZEABLE member backed by a std::vector<int>. They commit 100
 *    elements of -1, then shrink to 50 elements holding 1..50 and commit
 *    again. Rank kKillID then raises SIGTERM on itself.
 *  - When a Fenix::CommException is caught, the handler recreates the group,
 *    queries the stored subset with a null restore, resizes the vector to
 *    match and restores the data into it. A CommException caught during
 *    recovery restarts the recovery.
 *  - Finally every rank verifies that it holds 50 elements with values 1..50.
 *
 * @return 0 when the data was verified successfully, 1 otherwise.
 */
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  MPI_Comm res_comm;
  Fenix::init({.out_comm = &res_comm, .spares = 1});

  int num_ranks, rank;
  MPI_Comm_size(res_comm, &num_ranks);
  MPI_Comm_rank(res_comm, &rank);

  std::vector<int> data;

  // Ranks that start out as recovered ranks take the recovery path right away.
  // NOTE(review): presumably Fenix::throw_exception() throws the
  // Fenix::CommException handled below -- confirm against fenix.hpp.
  bool should_throw = Fenix_get_role() == FENIX_ROLE_RECOVERED_RANK;
  while(true) try {
    if(should_throw){
      should_throw = false;
      Fenix::throw_exception();
    }

    //Initial work and commits
    if(Fenix_get_role() == FENIX_ROLE_INITIAL_RANK){
      Fenix_Data_group_create(
        my_group, res_comm, start_timestamp, group_depth, FENIX_DATA_POLICY_IMR,
        NULL, &errflag
      );
      Fenix_Data_member_create(
        my_group, my_member, data.data(), FENIX_RESIZEABLE, MPI_INT
      );

      data.resize(100);
      for(int& i : data) i = -1;


      //Store the whole array first. We need to keep our buffer pointer updated
      //since resizing an array can change it
      Fenix_Data_member_attr_set(
        my_group, my_member, FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER, data.data(),
        &errflag
      );
      member_store(my_group, my_member, {{0, data.size()-1}});
      Fenix_Data_commit_barrier(my_group, NULL);


      //Now commit a smaller portion with different data.
      data.resize(50);
      int val = 1;
      for(int& i : data) i = val++;

      Fenix_Data_member_attr_set(
        my_group, my_member, FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER, data.data(),
        &errflag
      );
      member_store(my_group, my_member, {{0, data.size()-1}});
      Fenix_Data_commit_barrier(my_group, NULL);


      if(rank == kKillID){
        fprintf(stderr, "Doing kill on node %d\n", rank);
        raise(SIGTERM);
      }
    }

    Fenix_Finalize();


    break;
  } catch (const Fenix::CommException& e) {
    //Keep a copy of the error code rather than a pointer to the exception:
    //an exception object is destroyed as soon as its handler exits, so a
    //pointer to the nested exception would dangle on the next retry.
    auto fenix_err = e.fenix_err;
    while(true) try {
      //We've had a failure! Time to recover data.
      fprintf(stderr, "Starting data recovery on rank %d\n", rank);
      if(fenix_err != FENIX_SUCCESS){
        fprintf(stderr, "FAILURE on Fenix Init (%d). Exiting.\n", fenix_err);
        exit(1);
      }

      Fenix_Data_group_create(
        my_group, res_comm, start_timestamp, group_depth, FENIX_DATA_POLICY_IMR,
        NULL, &errflag
      );

      //Do a null restore to get information about the stored subset
      DataSubset stored_subset;
      int ret = member_restore(
        my_group, my_member, nullptr, 0, FENIX_DATA_SNAPSHOT_LATEST, stored_subset
      );
      if(ret != FENIX_SUCCESS) {
        fprintf(stderr, "Rank %d restore failure w/ code %d\n", rank, ret);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }

      //Resize data to fit all stored data
      data.resize(stored_subset.max_count());

      //Set all data to a value that was never stored, just for testing
      for(int& i : data) i = -2;

      //Now do an lrestore to get the recovered data.
      ret = member_lrestore(
        my_group, my_member, data.data(), data.size(), FENIX_DATA_SNAPSHOT_LATEST,
        stored_subset
      );
      if(ret != FENIX_SUCCESS) {
        //Report the code, but let the verification below decide the outcome.
        fprintf(stderr, "Rank %d lrestore returned code %d\n", rank, ret);
      }

      break;
    } catch (const Fenix::CommException& nested){
      //Another failure hit during recovery: remember its code and retry.
      fenix_err = nested.fenix_err;
    }
  }

  //Ensure data is correct after execution and recovery
  bool successful = data.size() == 50;
  if(!successful) printf("Rank %d expected data size 50, but got %zu\n", rank, data.size());

  //The loop only runs when the size check passed, so the cast cannot truncate.
  const int count = static_cast<int>(data.size());
  for(int i = 0; i < count && successful; i++){
    successful &= data[i] == i+1;
    if(!successful) printf("Rank %d data[%d]=%d, but should be %d!\n", rank, i, data[i], i+1);
  }

  if(successful){
    printf("Rank %d successfully recovered\n", rank);
  } else {
    printf("FAILURE on rank %d\n", rank);
  }

  MPI_Finalize();
  return !successful; //return error status
}
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ add_subdirectory(03_reduce/fenix)
add_subdirectory(04_Isend_Irecv/fenix)
add_subdirectory(05_subset_create)
add_subdirectory(06_subset_createv)
add_subdirectory(07_resizeable_member)
Loading
Loading