Skip to content

Commit 130dd9b

Browse files
committed
Merge remote-tracking branch 'sstcore/devel' into make_unique_vla
2 parents cbee348 + 6488a8c commit 130dd9b

35 files changed

+953
-251
lines changed

config/sst_check_filesystem.m4

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
dnl SST_CHECK_FILESYSTEM
dnl
dnl Decide whether -lstdc++fs has to be appended to LIBS so that code using
dnl std::filesystem links.  Older GCC releases keep the implementation of
dnl std::filesystem in that separate library.
dnl
dnl Result:
dnl   ac_cv_fs_stdlib=yes  -> -lstdc++fs was appended to LIBS
dnl   ac_cv_fs_stdlib=no   -> LIBS is left untouched
AC_DEFUN([SST_CHECK_FILESYSTEM],
[
  AC_LANG_PUSH([C++])

  AC_MSG_CHECKING([if std::filesystem requires linking stdc++fs])
  AC_LINK_IFELSE(
    [AC_LANG_SOURCE([
#include <filesystem>
int main() {
  std::filesystem::create_directory("/dev/null");
  return 0;
}
    ])],
    [ac_cv_fs_stdlib=no],
    [
      # The probe did not link without extra libraries.  Before adding
      # stdc++fs make sure the library can actually be linked on this
      # toolchain.  Appending a library that does not exist would break
      # every later link test as well as the final build.
      sst_check_fs_save_LIBS="$LIBS"
      LIBS="$LIBS -lstdc++fs"
      AC_LINK_IFELSE(
        [AC_LANG_PROGRAM([], [])],
        [ac_cv_fs_stdlib=yes],
        [ac_cv_fs_stdlib=no])
      LIBS="$sst_check_fs_save_LIBS"
    ])

  if test "x$ac_cv_fs_stdlib" = xyes; then
    AC_MSG_RESULT([yes])
    LIBS="$LIBS -lstdc++fs"
  else
    AC_MSG_RESULT([no])
  fi

  AC_LANG_POP([C++])
])

configure.ac

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@ AM_INIT_AUTOMAKE([1.9.6 foreign dist-bzip2 subdir-objects no-define tar-pax])
1313
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
1414
m4_pattern_allow([LT_LIBEXT])
1515

16+
AH_TOP([
17+
#ifndef _SST_CONFIG_H_
18+
#define _SST_CONFIG_H_
19+
])
20+
AH_BOTTOM([
21+
#endif /* _SST_CONFIG_H_ */
22+
])
1623
AC_CONFIG_HEADERS([src/sst/core/sst_config.h])
1724

1825
# Lets check for the standard compilers and basic options
@@ -93,6 +100,8 @@ SST_ENABLE_CORE_PROFILE()
93100

94101
SST_CHECK_FPIC()
95102

103+
SST_CHECK_FILESYSTEM()
104+
96105
AC_DEFINE_UNQUOTED([SST_CPPFLAGS], ["$CPPFLAGS"], [Defines the CPPFLAGS used to build SST])
97106
AC_DEFINE_UNQUOTED([SST_CFLAGS], ["$CFLAGS"], [Defines the CFLAGS used to build SST])
98107
AC_DEFINE_UNQUOTED([SST_CXXFLAGS], ["$CXXFLAGS"], [Defines the CXXFLAGS used to build SST])

src/sst/core/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ sst_core_sources = \
177177
configBase.cc \
178178
configShared.cc \
179179
configGraph.cc \
180+
configGraphOutput.cc \
180181
cfgoutput/pythonConfigOutput.cc \
181182
cfgoutput/dotConfigOutput.cc \
182183
cfgoutput/xmlConfigOutput.cc \

src/sst/core/checkpointAction.cc

Lines changed: 88 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,52 @@
2121
#include "sst/core/stringize.h"
2222
#include "sst/core/timeConverter.h"
2323

24-
// #include <filesystem>
24+
#include <filesystem>
2525
#include <sys/stat.h>
2626
#include <unistd.h>
2727

2828
namespace SST {
2929

30-
CheckpointAction::CheckpointAction(
31-
Config* UNUSED(cfg), RankInfo this_rank, Simulation_impl* sim, TimeConverter* period) :
30+
namespace pvt {
31+
32+
std::string
33+
createNameFromFormat(const std::string& format, const std::string& prefix, uint64_t checkpoint_id, SimTime_t time)
34+
{
35+
std::string ret;
36+
bool found_percent = false;
37+
for ( const auto& x : format ) {
38+
if ( found_percent ) {
39+
switch ( x ) {
40+
case 'p':
41+
ret += prefix;
42+
break;
43+
case 'n':
44+
ret += std::to_string(checkpoint_id);
45+
break;
46+
case 't':
47+
ret += std::to_string(time);
48+
break;
49+
default:
50+
// Should not happen since format string was already
51+
// checked, but if it does, just delete whole %
52+
// sequence (i.e. do nothing)
53+
break;
54+
}
55+
found_percent = false;
56+
}
57+
else if ( x == '%' ) {
58+
found_percent = true;
59+
}
60+
else {
61+
ret += x;
62+
}
63+
}
64+
return ret;
65+
}
66+
67+
} // namespace pvt
68+
69+
CheckpointAction::CheckpointAction(Config* cfg, RankInfo this_rank, Simulation_impl* sim, TimeConverter* period) :
3270
Action(),
3371
rank_(this_rank),
3472
period_(period),
@@ -53,7 +91,37 @@ CheckpointAction::CheckpointAction(
5391
next_sim_time_ = MAX_SIMTIME_T;
5492
}
5593

56-
if ( (0 == this_rank.rank) ) { last_cpu_time_ = sst_get_cpu_time(); }
94+
// Parse the format string. It was checked by the Config object
95+
// to make sure there was no more than one directory separator (/)
96+
// and that no invalid % sequences were used.
97+
std::string format = cfg->checkpoint_name_format();
98+
size_t split = format.find("/");
99+
if ( split == format.npos ) {
100+
dir_format_ = format;
101+
file_format_ = format;
102+
}
103+
else {
104+
dir_format_ = format.substr(0, split);
105+
file_format_ = format.substr(split + 1);
106+
}
107+
108+
if ( (0 == this_rank.rank) ) {
109+
// Check to make sure that there is at least one of %n or %t to
110+
// make checkpoint filenames unique.
111+
bool unique = false;
112+
if ( format.find("%n") != format.npos ) unique = true;
113+
if ( format.find("%t") != format.npos ) unique = true;
114+
115+
if ( !unique ) {
116+
sim->getSimulationOutput().output(
117+
"WARNING: checkpoint-name-format does not include one of %%n or %%t, which means that all checkpoints "
118+
"will use the same filename and previous files will be overwritten [%s].\n",
119+
format.c_str());
120+
}
121+
122+
last_cpu_time_ = sst_get_cpu_time();
123+
}
124+
57125
// Set the priority to be the same as the SyncManager so that
58126
// checkpointing happens in the same place for both serial and
59127
// parallel runs. We will never have both a SyncManager and a
@@ -78,7 +146,7 @@ CheckpointAction::execute()
78146
void
79147
CheckpointAction::createCheckpoint(Simulation_impl* sim)
80148
{
81-
if ( 0 == rank_.rank ) {
149+
if ( 0 == rank_.rank && 0 == rank_.thread ) {
82150
const double now = sst_get_cpu_time();
83151
sim->getSimulationOutput().output(
84152
"# Simulation Checkpoint: Simulated Time %s (Real CPU time since last checkpoint %.5f seconds)\n",
@@ -89,36 +157,31 @@ CheckpointAction::createCheckpoint(Simulation_impl* sim)
89157

90158
// Need to create a directory for this checkpoint
91159
std::string prefix = sim->checkpoint_prefix_;
92-
std::string basename = prefix + "_" + std::to_string(checkpoint_id) + "_" + std::to_string(sim->currentSimCycle);
160+
std::string basename = pvt::createNameFromFormat(dir_format_, prefix, checkpoint_id, sim->currentSimCycle);
93161

94162
// Directory is shared across threads. Make it a static and make
95163
// sure we barrier in the right places
96-
static std::string directory;
164+
std::string directory = sim->checkpoint_directory_ + "/" + basename;
97165

98166
// Only thread 0 will participate in setup
99167
if ( rank_.thread == 0 ) {
100168
// Rank 0 will create the directory for this checkpoint
101169
if ( rank_.rank == 0 ) {
102-
directory = Checkpointing::createUniqueDirectory(sim->checkpoint_directory_ + "/" + basename);
103-
#ifdef SST_CONFIG_HAVE_MPI
104-
Comms::broadcast(directory, 0);
105-
#endif
170+
directory = sim->checkpoint_directory_ + "/" + basename;
171+
std::filesystem::create_directory(directory);
106172
}
107-
else {
108-
// Get directory name (really just a barrier since each
109-
// rank already knows the name and it shouldn't have to
110-
// create a unique one)
111173
#ifdef SST_CONFIG_HAVE_MPI
112-
Comms::broadcast(directory, 0);
174+
Comms::broadcast(directory, 0);
113175
#endif
114-
}
115176
}
116-
barrier.wait();
117-
if ( rank_.thread == 0 ) checkpoint_id++;
118-
177+
basename = pvt::createNameFromFormat(file_format_, prefix, checkpoint_id, sim->currentSimCycle);
119178
std::string filename =
120179
directory + "/" + basename + "_" + std::to_string(rank_.rank) + "_" + std::to_string(rank_.thread) + ".bin";
121180

181+
barrier.wait();
182+
183+
if ( rank_.thread == 0 ) checkpoint_id++;
184+
122185
// Write out the checkpoints for the partitions
123186
sim->checkpoint(filename);
124187

@@ -245,55 +308,6 @@ doesDirectoryExist(const std::string& dirName, bool include_files)
245308
}
246309
}
247310

248-
/**
249-
Function to create a directory. We need this bacause
250-
std::filesystem isn't fully supported until GCC9
251-
*/
252-
bool
253-
createDirectory(const std::string& dirName)
254-
{
255-
if ( mkdir(dirName.c_str(), 0755) == 0 ) {
256-
return true; // Directory created successfully
257-
}
258-
else {
259-
return false; // Failed to create directory
260-
}
261-
}
262-
263-
std::string
264-
createUniqueDirectory(const std::string basename)
265-
{
266-
std::string dirName = basename;
267-
268-
// Check if the directory exists
269-
// if ( std::filesystem::exists(dirName) ) {
270-
if ( doesDirectoryExist(dirName, true) ) {
271-
// Append a unique random set of characters to the directory name
272-
std::string newDirName;
273-
int num = 0;
274-
do {
275-
++num;
276-
newDirName = dirName + "_" + std::to_string(num);
277-
// } while ( std::filesystem::exists(newDirName) ); // Ensure the new directory name is unique
278-
} while ( doesDirectoryExist(newDirName, true) ); // Ensure the new directory name is unique
279-
280-
dirName = newDirName;
281-
}
282-
283-
// Create the directory
284-
// if ( !std::filesystem::create_directory(dirName) ) {
285-
if ( !createDirectory(dirName) ) {
286-
Simulation_impl::getSimulationOutput().fatal(
287-
CALL_INFO_LONG, 1, "Failed to create directory: %s\n", dirName.c_str());
288-
}
289-
return dirName;
290-
}
291-
292-
void
293-
removeDirectory(const std::string UNUSED(name))
294-
{
295-
// Implement when adding logic to keep only N checkpoints
296-
}
297311

298312
std::string
299313
initializeCheckpointInfrastructure(Config* cfg, bool rt_can_ckpt, int myRank)
@@ -303,7 +317,11 @@ initializeCheckpointInfrastructure(Config* cfg, bool rt_can_ckpt, int myRank)
303317

304318
std::string checkpoint_dir_name = "";
305319

306-
if ( myRank == 0 ) { checkpoint_dir_name = createUniqueDirectory(cfg->checkpoint_prefix()); }
320+
if ( myRank == 0 ) {
321+
SST::Util::Filesystem& fs = Simulation_impl::getSimulation()->filesystem;
322+
checkpoint_dir_name = fs.createUniqueDirectory(cfg->checkpoint_prefix());
323+
}
324+
307325
#ifdef SST_CONFIG_HAVE_MPI
308326
// Broadcast the directory name
309327
Comms::broadcast(checkpoint_dir_name, 0);

src/sst/core/checkpointAction.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ class CheckpointAction : public Action
9494
double last_cpu_time_; // Last time a checkpoint was triggered
9595
bool generate_; // Whether a checkpoint should be done next time check() is called
9696
SimTime_t next_sim_time_; // Next simulation time a checkpoint should trigger at or 0 if not applicable
97+
std::string dir_format_; // Format string for checkpoint directory names
98+
std::string file_format_; // Format string for checkpoint file names
9799
};
98100

99101
} // namespace SST

0 commit comments

Comments
 (0)