Skip to content

Commit ab2f2de

Browse files
committed
Fix checkpoint action for multithreaded execution
1 parent c1667d5 commit ab2f2de

File tree

6 files changed

+49
-2
lines changed

6 files changed

+49
-2
lines changed

src/sst/core/checkpointAction.cc

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ CheckpointAction::execute()
154154
void
155155
CheckpointAction::createCheckpoint(Simulation_impl* sim)
156156
{
157+
157158
if ( 0 == rank_.rank && 0 == rank_.thread ) {
158159
const double now = sst_get_cpu_time();
159160
sim->getSimulationOutput().output(
@@ -212,9 +213,11 @@ CheckpointAction::createCheckpoint(Simulation_impl* sim)
212213
// No need to barrier here since rank 0 thread 0 will be the first
213214
// to execute in the loop below and everything else will wait
214215
for ( uint32_t r = 0; r < num_ranks.rank; ++r ) {
216+
215217
if ( r == rank_.rank ) {
216218
// If this is my rank go ahead
217219
for ( uint32_t t = 0; t < num_ranks.thread; ++t ) {
220+
218221
// If this is my thread go ahead
219222
if ( t == rank_.thread ) {
220223
sim->checkpoint_append_registry(directory + "/" + registry_name, filename);
@@ -253,13 +256,18 @@ CheckpointAction::createCheckpoint(Simulation_impl* sim)
253256
// SyncManager check whether a checkpoint needs to be generated
254257
SimTime_t
255258
CheckpointAction::check(SimTime_t current_time)
256-
{
259+
{
260+
#if 0
261+
Simulation_impl* sim = Simulation_impl::getSimulation();
262+
sim->getSimulationOutput().output(
263+
"skk:T %d: checkpointAction.cc: check()\n", rank_.thread);
264+
#endif
257265
// The if-logic is a little weird, but it's trying to minimize the
258266
// number of branches in the normal case of no checkpoint being
259267
// initiated. This will also handle the case where both a sim and
260268
// real-time trigger happened at the same time
261269
if ( (current_time == next_sim_time_) || generate_ ) {
262-
Simulation_impl* sim = Simulation_impl::getSimulation();
270+
Simulation_impl* sim = Simulation_impl::getSimulation();
263271
createCheckpoint(sim);
264272
generate_ = false;
265273
// Only add to the simulation-interval checkpoint time if it
@@ -271,6 +279,11 @@ CheckpointAction::check(SimTime_t current_time)
271279
return next_sim_time_;
272280
}
273281

282+
bool
283+
CheckpointAction::getCheckpoint()
284+
{
285+
return generate_;
286+
}
274287
void
275288
CheckpointAction::setCheckpoint()
276289
{

src/sst/core/checkpointAction.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ class CheckpointAction : public Action
7676
*/
7777
void insertIntoTimeVortex(Simulation_impl* sim);
7878

79+
/** Get checkpoint flag */
80+
bool getCheckpoint();
81+
7982
/** Generate a checkpoint next time check() is called */
8083
void setCheckpoint();
8184

src/sst/core/realtime.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,12 @@ RealTimeManager::notifySignal()
635635
}
636636

637637
if ( sig_alrm_from_os_ ) {
638+
#if 0
639+
Output sim_output = Simulation_impl::getSimulation()->getSimulationOutput();
640+
RankInfo rank = Simulation_impl::getSimulation()->getRank();
641+
sim_output.output(
642+
"skk:T%d: realtime: sig_alrm_from_os\n", rank.thread);
643+
#endif
638644
sig_alrm_from_os_ = 0;
639645
if ( serial_exec_ ) {
640646
signal_actions_[SIGALRM]->execute();

src/sst/core/simulation.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1715,6 +1715,10 @@ Simulation_impl::getComponentObjectMap()
17151715
void
17161716
Simulation_impl::scheduleCheckpoint()
17171717
{
1718+
#if 0
1719+
sim_output.output(
1720+
"skk: simulation: scheduleCheckpoint\n");
1721+
#endif
17181722
checkpoint_action_->setCheckpoint();
17191723

17201724
// Trigger checkpoint immediately in serial simulations

src/sst/core/sync/syncManager.cc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -618,9 +618,28 @@ SyncManager::execute()
618618
real_time_->performSignal(sig_end);
619619
else if ( signals_received ) {
620620
if ( sig_usr ) real_time_->performSignal(sig_usr);
621+
#if 0
621622
if ( sig_alrm ) real_time_->performSignal(sig_alrm);
623+
#else
624+
if (sig_alrm) {
625+
//out.output("skk:syncmgr:execute: T%d: in sigalrm\n", rank_.thread);
626+
real_time_->performSignal(sig_alrm);
627+
}
628+
#endif
629+
}
630+
631+
// Check the local checkpoint generate flag and set the shared ckpt_generate_ flag if needed.
632+
if (checkpoint_->getCheckpoint() == true) {
633+
ckpt_generate_.store(1);
634+
}
635+
// Ensure every thread has written the shared ckpt_generate_ flag before updating its local generate_
636+
ic_barrier_.wait();
637+
if (ckpt_generate_.load()) {
638+
checkpoint_->setCheckpoint();
622639
}
623640
next_checkpoint_time = checkpoint_->check(getDeliveryTime());
641+
ckpt_generate_.store(0);
642+
624643

625644
handleShutdown(); // Check if any thread set shutdown
626645
handleInteractiveConsole(); // Check if any thread set interactive console
@@ -730,6 +749,7 @@ SyncManager::addProfileTool(Profile::SyncProfileTool* tool)
730749
profile_tools_->addProfileTool(tool);
731750
}
732751

752+
std::atomic<unsigned> SyncManager::ckpt_generate_ { 0 };
733753
std::atomic<unsigned> SyncManager::enter_interactive_mask_ { 0 };
734754
std::atomic<int> SyncManager::current_ic_thread_ { 0 };
735755
std::atomic<int> SyncManager::current_ic_state_ { 0 };

src/sst/core/sync/syncManager.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ class SyncManager : public Action
190190

191191
RealTimeManager* real_time_;
192192
CheckpointAction* checkpoint_;
193+
static std::atomic<unsigned> ckpt_generate_;
193194
static std::atomic<unsigned> enter_interactive_mask_;
194195
static std::atomic<int> current_ic_thread_;
195196
static std::atomic<int> current_ic_state_;

0 commit comments

Comments
 (0)