2121#include " sst/core/stringize.h"
2222#include " sst/core/timeConverter.h"
2323
24- // #include <filesystem>
24+ #include < filesystem>
2525#include < sys/stat.h>
2626#include < unistd.h>
2727
2828namespace SST {
2929
30- CheckpointAction::CheckpointAction (
31- Config* UNUSED (cfg), RankInfo this_rank, Simulation_impl* sim, TimeConverter* period) :
30+ namespace pvt {
31+
32+ std::string
33+ createNameFromFormat (const std::string& format, const std::string& prefix, uint64_t checkpoint_id, SimTime_t time)
34+ {
35+ std::string ret;
36+ bool found_percent = false ;
37+ for ( const auto & x : format ) {
38+ if ( found_percent ) {
39+ switch ( x ) {
40+ case ' p' :
41+ ret += prefix;
42+ break ;
43+ case ' n' :
44+ ret += std::to_string (checkpoint_id);
45+ break ;
46+ case ' t' :
47+ ret += std::to_string (time);
48+ break ;
49+ default :
50+ // Should not happen since format string was already
51+ // checked, but if it does, just delete whole %
52+ // sequence (i.e. do nothing)
53+ break ;
54+ }
55+ found_percent = false ;
56+ }
57+ else if ( x == ' %' ) {
58+ found_percent = true ;
59+ }
60+ else {
61+ ret += x;
62+ }
63+ }
64+ return ret;
65+ }
66+
67+ } // namespace pvt
68+
69+ CheckpointAction::CheckpointAction (Config* cfg, RankInfo this_rank, Simulation_impl* sim, TimeConverter* period) :
3270 Action (),
3371 rank_ (this_rank),
3472 period_ (period),
@@ -53,7 +91,37 @@ CheckpointAction::CheckpointAction(
5391 next_sim_time_ = MAX_SIMTIME_T;
5492 }
5593
56- if ( (0 == this_rank.rank ) ) { last_cpu_time_ = sst_get_cpu_time (); }
94+ // Parse the format string. It was checked by the Config object
95+ // to make sure there was no more than one directory separator (/)
96+ // and that no invalid % sequences were used.
97+ std::string format = cfg->checkpoint_name_format ();
98+ size_t split = format.find (" /" );
99+ if ( split == format.npos ) {
100+ dir_format_ = format;
101+ file_format_ = format;
102+ }
103+ else {
104+ dir_format_ = format.substr (0 , split);
105+ file_format_ = format.substr (split + 1 );
106+ }
107+
108+ if ( (0 == this_rank.rank ) ) {
109+ // Check to make sure that there is at least one of %n or %t to
110+ // make checkpoint filenames unique.
111+ bool unique = false ;
112+ if ( format.find (" %n" ) != format.npos ) unique = true ;
113+ if ( format.find (" %t" ) != format.npos ) unique = true ;
114+
115+ if ( !unique ) {
116+ sim->getSimulationOutput ().output (
117+ " WARNING: checkpoint-name-format does not include one of %%n or %%t, which means that all checkpoints "
118+ " will use the same filename and previous files will be overwritten [%s].\n " ,
119+ format.c_str ());
120+ }
121+
122+ last_cpu_time_ = sst_get_cpu_time ();
123+ }
124+
57125 // Set the priority to be the same as the SyncManager so that
58126 // checkpointing happens in the same place for both serial and
59127 // parallel runs. We will never have both a SyncManager and a
@@ -78,7 +146,7 @@ CheckpointAction::execute()
78146void
79147CheckpointAction::createCheckpoint (Simulation_impl* sim)
80148{
81- if ( 0 == rank_.rank ) {
149+ if ( 0 == rank_.rank && 0 == rank_. thread ) {
82150 const double now = sst_get_cpu_time ();
83151 sim->getSimulationOutput ().output (
84152 " # Simulation Checkpoint: Simulated Time %s (Real CPU time since last checkpoint %.5f seconds)\n " ,
@@ -89,36 +157,31 @@ CheckpointAction::createCheckpoint(Simulation_impl* sim)
89157
90158 // Need to create a directory for this checkpoint
91159 std::string prefix = sim->checkpoint_prefix_ ;
92- std::string basename = prefix + " _ " + std::to_string (checkpoint_id) + " _ " + std::to_string ( sim->currentSimCycle );
160+ std::string basename = pvt::createNameFromFormat (dir_format_, prefix, checkpoint_id, sim->currentSimCycle );
93161
94162 // Each thread builds the directory name locally (it is no longer a
95163 // static); barrier.wait() below runs before any checkpoint file is written
96- static std::string directory;
164+ std::string directory = sim-> checkpoint_directory_ + " / " + basename ;
97165
98166 // Only thread 0 will participate in setup
99167 if ( rank_.thread == 0 ) {
100168 // Rank 0 will create the directory for this checkpoint
101169 if ( rank_.rank == 0 ) {
102- directory = Checkpointing::createUniqueDirectory (sim->checkpoint_directory_ + " /" + basename);
103- #ifdef SST_CONFIG_HAVE_MPI
104- Comms::broadcast (directory, 0 );
105- #endif
170+ directory = sim->checkpoint_directory_ + " /" + basename;
171+ std::filesystem::create_directory (directory);
106172 }
107- else {
108- // Get directory name (really just a barrier since each
109- // rank already knows the name and it shouldn't have to
110- // create a unique one)
111173#ifdef SST_CONFIG_HAVE_MPI
112- Comms::broadcast (directory, 0 );
174+ Comms::broadcast (directory, 0 );
113175#endif
114- }
115176 }
116- barrier.wait ();
117- if ( rank_.thread == 0 ) checkpoint_id++;
118-
177+ basename = pvt::createNameFromFormat (file_format_, prefix, checkpoint_id, sim->currentSimCycle );
119178 std::string filename =
120179 directory + " /" + basename + " _" + std::to_string (rank_.rank ) + " _" + std::to_string (rank_.thread ) + " .bin" ;
121180
181+ barrier.wait ();
182+
183+ if ( rank_.thread == 0 ) checkpoint_id++;
184+
122185 // Write out the checkpoints for the partitions
123186 sim->checkpoint (filename);
124187
@@ -245,55 +308,6 @@ doesDirectoryExist(const std::string& dirName, bool include_files)
245308 }
246309}
247310
/**
   Creates a single directory by calling mkdir() with permissions
   0755. We need this because std::filesystem isn't fully supported
   until GCC9.

   @param dirName Path of the directory to create
   @return true if mkdir() reported success, false otherwise
*/
bool
createDirectory(const std::string& dirName)
{
    const int status = mkdir(dirName.c_str(), 0755);
    return status == 0;
}
262-
263- std::string
264- createUniqueDirectory (const std::string basename)
265- {
266- std::string dirName = basename;
267-
268- // Check if the directory exists
269- // if ( std::filesystem::exists(dirName) ) {
270- if ( doesDirectoryExist (dirName, true ) ) {
271- // Append a unique random set of characters to the directory name
272- std::string newDirName;
273- int num = 0 ;
274- do {
275- ++num;
276- newDirName = dirName + " _" + std::to_string (num);
277- // } while ( std::filesystem::exists(newDirName) ); // Ensure the new directory name is unique
278- } while ( doesDirectoryExist (newDirName, true ) ); // Ensure the new directory name is unique
279-
280- dirName = newDirName;
281- }
282-
283- // Create the directory
284- // if ( !std::filesystem::create_directory(dirName) ) {
285- if ( !createDirectory (dirName) ) {
286- Simulation_impl::getSimulationOutput ().fatal (
287- CALL_INFO_LONG, 1 , " Failed to create directory: %s\n " , dirName.c_str ());
288- }
289- return dirName;
290- }
291-
292- void
293- removeDirectory (const std::string UNUSED (name))
294- {
295- // Implement when adding logic to keep only N checkpoints
296- }
297311
298312std::string
299313initializeCheckpointInfrastructure (Config* cfg, bool rt_can_ckpt, int myRank)
@@ -303,7 +317,11 @@ initializeCheckpointInfrastructure(Config* cfg, bool rt_can_ckpt, int myRank)
303317
304318 std::string checkpoint_dir_name = " " ;
305319
306- if ( myRank == 0 ) { checkpoint_dir_name = createUniqueDirectory (cfg->checkpoint_prefix ()); }
320+ if ( myRank == 0 ) {
321+ SST::Util::Filesystem& fs = Simulation_impl::getSimulation ()->filesystem ;
322+ checkpoint_dir_name = fs.createUniqueDirectory (cfg->checkpoint_prefix ());
323+ }
324+
307325#ifdef SST_CONFIG_HAVE_MPI
308326 // Broadcast the directory name
309327 Comms::broadcast (checkpoint_dir_name, 0 );
0 commit comments