@@ -126,6 +126,16 @@ class SimulationHelper : public IPlugin
126126 /* trigger checkpoint notification */
127127 if (checkpointPeriod && (currentStep % checkpointPeriod == 0 ))
128128 {
129+ /* first synchronize: if something failed, we can spare the time
130+ * for the checkpoint writing */
131+ CUDA_CHECK (cudaDeviceSynchronize ());
132+ CUDA_CHECK (cudaGetLastError ());
133+
134+ GridController<DIM> &gc = Environment<DIM>::get ().GridController ();
135+ /* can be omitted for better scaling, but lets us skip the
136+ * time spent on checkpointing if some ranks died */
137+ MPI_CHECK (MPI_Barrier (gc.getCommunicator ().getMPIComm ()));
138+
129139 /* create directory containing checkpoints */
130140 if (numCheckpoints == 0 )
131141 {
@@ -135,7 +145,14 @@ class SimulationHelper : public IPlugin
135145 Environment<DIM>::get ().PluginConnector ().checkpointPlugins (currentStep,
136146 checkpointDirectory);
137147
138- GridController<DIM> &gc = Environment<DIM>::get ().GridController ();
148+ /* important synchronize: a checkpoint is only guaranteed to be
149+ * usable if no errors occurred up to this point */
150+ CUDA_CHECK (cudaDeviceSynchronize ());
151+ CUDA_CHECK (cudaGetLastError ());
152+
153+ /* \todo in an ideal world with MPI-3, this would be an
154+ * MPI_Ibarrier call and this function would return an MPI_Request
155+ * that could be checked */
139156 MPI_CHECK (MPI_Barrier (gc.getCommunicator ().getMPIComm ()));
140157
141158 if (gc.getGlobalRank () == 0 )
0 commit comments