@@ -33,14 +33,24 @@ This is a template function that should be implemented for each component model.
33
33
get_model_cache (sim:: Interfacer.ComponentModelSimulation ) = nothing
34
34
35
35
"""
36
- checkpoint_model_state(sim::Interfacer.ComponentModelSimulation, comms_ctx::ClimaComms.AbstractCommsContext, t::Int; output_dir = "output")
36
+ checkpoint_model_state(
37
+ sim::Interfacer.ComponentModelSimulation,
38
+ comms_ctx::ClimaComms.AbstractCommsContext,
39
+ t::Int,
40
+ prev_checkpoint_t::Int;
41
+ output_dir = "output")
37
42
38
43
Checkpoint the model state of a simulation to a HDF5 file at a given time, t (in seconds).
44
+
45
+ If a previous checkpoint exists, it is removed. This is to avoid accumulating
46
+ many checkpoint files in the output directory. A value of -1 for `prev_checkpoint_t`
47
+ is used to indicate that there is no previous checkpoint to remove.
39
48
"""
40
49
function checkpoint_model_state (
41
50
sim:: Interfacer.ComponentModelSimulation ,
42
51
comms_ctx:: ClimaComms.AbstractCommsContext ,
43
- t:: Int ;
52
+ t:: Int ,
53
+ prev_checkpoint_t:: Int ;
44
54
output_dir = " output" ,
45
55
)
46
56
Y = get_model_prog_state (sim)
@@ -52,23 +62,36 @@ function checkpoint_model_state(
52
62
CC. InputOutput. HDF5. write_attribute (checkpoint_writer. file, " time" , t)
53
63
CC. InputOutput. write! (checkpoint_writer, Y, " model_state" )
54
64
Base. close (checkpoint_writer)
55
- return nothing
56
65
66
+ # Remove previous checkpoint if it exists
67
+ prev_checkpoint_file = joinpath (output_dir, " checkpoint_$(nameof (sim)) _$(prev_checkpoint_t) .hdf5" )
68
+ remove_checkpoint (prev_checkpoint_file, prev_checkpoint_t, comms_ctx)
69
+ return nothing
57
70
end
58
71
59
72
"""
60
- checkpoint_model_cache(sim::Interfacer.ComponentModelSimulation, comms_ctx::ClimaComms.AbstractCommsContext, t::Int; output_dir = "output")
73
+ checkpoint_model_cache(
74
+ sim::Interfacer.ComponentModelSimulation,
75
+ comms_ctx::ClimaComms.AbstractCommsContext,
76
+ t::Int,
77
+ prev_checkpoint_t::Int;
78
+ output_dir = "output")
61
79
62
80
Checkpoint the model cache to N JLD2 files at a given time, t (in seconds),
63
81
where N is the number of MPI ranks.
64
82
65
83
Objects are saved to JLD2 files because caches are generally not ClimaCore
66
84
objects (and ClimaCore.InputOutput can only save `Field`s or `FieldVector`s).
85
+
86
+ If a previous checkpoint exists, it is removed. This is to avoid accumulating
87
+ many checkpoint files in the output directory. A value of -1 for `prev_checkpoint_t`
88
+ is used to indicate that there is no previous checkpoint to remove.
67
89
"""
68
90
function checkpoint_model_cache (
69
91
sim:: Interfacer.ComponentModelSimulation ,
70
92
comms_ctx:: ClimaComms.AbstractCommsContext ,
71
- t:: Int ;
93
+ t:: Int ,
94
+ prev_checkpoint_t:: Int ;
72
95
output_dir = " output" ,
73
96
)
74
97
# Move p to CPU (because we cannot save CUArrays)
@@ -79,6 +102,10 @@ function checkpoint_model_cache(
79
102
pid = ClimaComms. mypid (comms_ctx)
80
103
output_file = joinpath (output_dir, " checkpoint_cache_$(pid) _$(nameof (sim)) _$t .jld2" )
81
104
JLD2. jldsave (output_file, cache = p)
105
+
106
+ # Remove previous checkpoint if it exists
107
+ prev_checkpoint_file = joinpath (output_dir, " checkpoint_cache_$(pid) _$(nameof (sim)) _$(prev_checkpoint_t) .jld2" )
108
+ remove_checkpoint (prev_checkpoint_file, prev_checkpoint_t, comms_ctx)
82
109
return nothing
83
110
end
84
111
@@ -104,21 +131,31 @@ function checkpoint_sims(cs::Interfacer.CoupledSimulation)
104
131
day = floor (Int, time / (60 * 60 * 24 ))
105
132
sec = floor (Int, time % (60 * 60 * 24 ))
106
133
output_dir = cs. dirs. checkpoints
134
+ prev_checkpoint_t = cs. prev_checkpoint_t[]
107
135
comms_ctx = ClimaComms. context (cs)
136
+
108
137
for sim in cs. model_sims
109
138
if ! isnothing (Checkpointer. get_model_prog_state (sim))
110
- Checkpointer. checkpoint_model_state (sim, comms_ctx, time; output_dir)
139
+ Checkpointer. checkpoint_model_state (sim, comms_ctx, time, prev_checkpoint_t ; output_dir)
111
140
end
112
141
if ! isnothing (Checkpointer. get_model_cache (sim))
113
- Checkpointer. checkpoint_model_cache (sim, comms_ctx, time; output_dir)
142
+ Checkpointer. checkpoint_model_cache (sim, comms_ctx, time, prev_checkpoint_t ; output_dir)
114
143
end
115
144
end
145
+
116
146
# Checkpoint the Coupler fields
117
147
pid = ClimaComms. mypid (comms_ctx)
118
148
@info " Saving coupler fields to JLD2 on day $day second $sec "
119
149
output_file = joinpath (output_dir, " checkpoint_coupler_fields_$(pid) _$time .jld2" )
120
150
# Adapt to Array move fields to the CPU
121
151
JLD2. jldsave (output_file, coupler_fields = CC. Adapt. adapt (Array, cs. fields))
152
+
153
+ # Remove previous Coupler fields checkpoint if it exists
154
+ prev_checkpoint_file = joinpath (output_dir, " checkpoint_coupler_fields_$(pid) _$(prev_checkpoint_t) .jld2" )
155
+ remove_checkpoint (prev_checkpoint_file, prev_checkpoint_t, comms_ctx)
156
+
157
+ # Update previous checkpoint time stored in the coupled simulation
158
+ cs. prev_checkpoint_t[] = time
122
159
end
123
160
124
161
"""
@@ -211,4 +248,19 @@ function t_start_from_checkpoint(checkpoint_dir)
211
248
return parse (Int, match (restart_file_rx, latest_restart)[2 ])
212
249
end
213
250
251
+ """
252
+ remove_checkpoint(prev_checkpoint_file, prev_checkpoint_t, comms_ctx)
253
+
254
+ Delete the provided checkpoint file on the root process and print a helpful
255
+ info message. This can be used to remove intermediate checkpoints, to prevent
256
+ saving excessively large amounts of output.
257
+ """
258
+ function remove_checkpoint (prev_checkpoint_file, prev_checkpoint_t, comms_ctx)
259
+ if ClimaComms. iamroot (comms_ctx) && prev_checkpoint_t != - 1 && isfile (prev_checkpoint_file)
260
+ @info " Removing previous checkpoint file: $prev_checkpoint_file "
261
+ rm (prev_checkpoint_file)
262
+ end
263
+ return nothing
264
+ end
265
+
214
266
end # module
0 commit comments