Skip to content

Commit 5f79625

Browse files
authored
Merge pull request #1548 from CliMA/js/zs/soft-restarts
add an option `restart_cache`
2 parents b42ea0c + 1a09f06 commit 5f79625

File tree

8 files changed

+109
-7
lines changed

8 files changed

+109
-7
lines changed

.buildkite/pipeline.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,15 @@ steps:
127127
slurm_gres: "gpu:1"
128128
slurm_mem: 32GB
129129

130+
- label: "GPU restarts"
131+
command: "julia --color=yes --project=experiments/ClimaEarth/ experiments/ClimaEarth/test/restart_state_only.jl"
132+
env:
133+
CLIMACOMMS_DEVICE: "CUDA"
134+
agents:
135+
slurm_ntasks: 1
136+
slurm_gres: "gpu:1"
137+
slurm_mem: 32GB
138+
130139
- group: "Integration Tests"
131140
steps:
132141
# SLABPLANET EXPERIMENTS

NEWS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@ ClimaCoupler.jl Release Notes
66

77
### ClimaCoupler features
88

9+
#### Add an option `restart_cache` PR[#1548](https://github.com/CliMA/ClimaCoupler.jl/pull/1548)
10+
Adds an option so that if restart files are available, we can choose to
11+
restart the state only (`restart_cache` false), or to restart both the state
12+
and the cache (`restart_cache` true). `restart_cache` is true by default.
13+
914
#### Remove `FluxCalculator.surface_inputs` helper function PR[#1543](https://github.com/CliMA/ClimaCoupler.jl/pull/1543)
1015
We can simplify the flux calculation by calling `SF.ValuesOnly` directly.
1116
Since we now remap all quantities onto the boundary space when we compute

experiments/ClimaEarth/cli_options.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,10 @@ function argparse_settings()
100100
help = "Time in seconds rounded to the nearest index to use at `t_start` for restarted simulation [nothing (default)]"
101101
arg_type = Int
102102
default = nothing
103+
"--restart_cache"
104+
help = "Boolean flag indicating whether to read the cache from the restart file if available [`true` (default), `false`]"
105+
arg_type = Bool
106+
default = true
103107
# Diagnostics information
104108
"--use_coupler_diagnostics"
105109
help = "Boolean flag indicating whether to compute and output coupler diagnostics [`true` (default), `false`]"

experiments/ClimaEarth/components/atmosphere/climaatmos.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ function Checkpointer.restore_cache!(sim::ClimaAtmosSimulation, new_cache)
152152
:hyperdiffusion_ghost_buffer,
153153
:data_handler,
154154
:graph_context,
155+
:dt,
155156
]),
156157
)
157158
return nothing

experiments/ClimaEarth/setup_run.jl

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ function CoupledSimulation(config_dict::AbstractDict)
127127
detect_restart_files,
128128
restart_dir,
129129
restart_t,
130+
restart_cache,
130131
use_land_diagnostics,
131132
diagnostics_dt,
132133
evolving_ocean,
@@ -524,12 +525,19 @@ function CoupledSimulation(config_dict::AbstractDict)
524525

525526
#=
526527
## Restart component model states if specified
527-
If a restart directory is specified and contains output files from the `checkpoint_cb` callback, the component model states are restarted from those files. The restart directory
528-
is specified in the `config_dict` dictionary. The `restart_t` field specifies the time step at which the restart is performed.
528+
If a restart directory is specified and contains output files from the `checkpoint_cb` callback,
529+
the component model states are restarted from those files. The restart directory is specified in
530+
the `config_dict` dictionary. The `restart_t` field specifies the time step at which the restart
531+
is performed.
532+
533+
If `restart_cache` is true, the caches will be read from the restart file using `restore_cache!`.
534+
Otherwise, the caches will be initialized in each component model's constructor.
535+
When the caches are not read from the restart file, we have to perform the initial component
536+
model exchange so that `set_caches!` can be called to initialize the caches.
529537
=#
530-
should_restart && Checkpointer.restart!(cs, restart_dir, restart_t)
538+
should_restart && Checkpointer.restart!(cs, restart_dir, restart_t, restart_cache)
531539

532-
if !should_restart
540+
if !should_restart || !restart_cache
533541
#=
534542
## Initialize Component Model Exchange
535543
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# This test runs a small AMIP simulation twice.
2+
#
3+
# - The first time the simulation is run for two steps
4+
# - The second time the simulation is run for two steps, but restarting from the
5+
# first simulation
6+
#
7+
# Since the caches are not read from the restart file, the results will not be
8+
# bit-wise identical to running the simulation without restarting. This test
9+
# only checks that a simulation can be restarted from the state only without erroring.
10+
#
11+
# The content of the simulation is not the most important, but it helps if it
12+
# has all of the complexity possible.
13+
14+
import ClimaComms
15+
ClimaComms.@import_required_backends
16+
import ClimaUtilities.OutputPathGenerator: maybe_wait_filesystem
17+
import YAML
18+
import Logging
19+
using Test
20+
21+
# Uncomment the following for cleaner output (but more difficult debugging)
22+
# Logging.disable_logging(Logging.Warn)
23+
24+
include("compare.jl")
25+
include("../setup_run.jl")
26+
27+
comms_ctx = ClimaComms.context()
28+
@info "Context: $(comms_ctx)"
29+
ClimaComms.init(comms_ctx)
30+
31+
# Make sure that all MPI processes agree on the output_loc
32+
tmpdir = ClimaComms.iamroot(comms_ctx) ? mktempdir(pwd()) : ""
33+
tmpdir = ClimaComms.bcast(comms_ctx, tmpdir)
34+
# Sometimes the shared filesystem doesn't work properly and the folder is not
35+
# synced across MPI processes. Let's add an additional check here.
36+
maybe_wait_filesystem(ClimaComms.context(), tmpdir)
37+
38+
# Parse the input config file as a dictionary
39+
config_file = joinpath(@__DIR__, "amip_test.yml")
40+
config_dict = get_coupler_config_dict(config_file)
41+
42+
# Two steps
43+
two_steps = deepcopy(config_dict)
44+
45+
two_steps["dt"] = "180secs"
46+
two_steps["dt_cpl"] = "180secs"
47+
two_steps["t_end"] = "360secs"
48+
two_steps["dt_rad"] = "180secs"
49+
two_steps["checkpoint_dt"] = "360secs"
50+
two_steps["coupler_output_dir"] = tmpdir
51+
two_steps["job_id"] = "two_steps"
52+
53+
println("Simulating two steps")
54+
cs_two_steps = setup_and_run(two_steps)
55+
56+
# Check that we can pick up a simulation by providing restart_t and restart_dir
57+
println("Simulating two steps, options from command line")
58+
two_steps_reading = deepcopy(two_steps)
59+
60+
two_steps_reading["t_end"] = "540secs"
61+
two_steps_reading["detect_restart_files"] = true
62+
two_steps_reading["restart_dir"] = cs_two_steps.dir_paths.checkpoints_dir
63+
two_steps_reading["restart_t"] = 360
64+
two_steps_reading["restart_cache"] = false
65+
two_steps_reading["job_id"] = "two_steps_reading"
66+
67+
cs_two_steps_reading = setup_and_run(two_steps_reading)
68+
@testset "Restarts from command line arguments" begin
69+
@test cs_two_steps_reading.tspan[1] == cs_two_steps.tspan[2]
70+
end

experiments/ClimaEarth/user_io/arg_parsing.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ function get_coupler_args(config_dict::Dict)
145145
detect_restart_files = config_dict["detect_restart_files"]
146146
restart_dir = config_dict["restart_dir"]
147147
restart_t = config_dict["restart_t"]
148+
restart_cache = config_dict["restart_cache"]
148149

149150
# Diagnostics information
150151
use_coupler_diagnostics = config_dict["use_coupler_diagnostics"]
@@ -188,6 +189,7 @@ function get_coupler_args(config_dict::Dict)
188189
detect_restart_files,
189190
restart_dir,
190191
restart_t,
192+
restart_cache,
191193
use_coupler_diagnostics,
192194
diagnostics_dt,
193195
evolving_ocean,

src/Checkpointer.jl

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -176,13 +176,16 @@ function checkpoint_sims(cs::Interfacer.CoupledSimulation)
176176
end
177177

178178
"""
179-
restart!(cs::CoupledSimulation, checkpoint_dir, checkpoint_t)
179+
restart!(cs::CoupledSimulation, checkpoint_dir, checkpoint_t, restart_cache)
180180
181181
Overwrite the content of `cs` with checkpoints in `checkpoint_dir` at time `checkpoint_t`.
182182
183+
If `restart_cache` is true, the cache will be read from the restart file using `restore_cache!`.
184+
Otherwise, the cache will be left unchanged.
185+
183186
Return `true` if the simulation was restarted.
184187
"""
185-
function restart!(cs, checkpoint_dir, checkpoint_t)
188+
function restart!(cs, checkpoint_dir, checkpoint_t, restart_cache)
186189
@info "Restarting from time $(checkpoint_t) and directory $(checkpoint_dir)"
187190
pid = ClimaComms.mypid(ClimaComms.context(cs))
188191
for sim in cs.model_sims
@@ -194,7 +197,7 @@ function restart!(cs, checkpoint_dir, checkpoint_t)
194197
)
195198
restart_model_state!(sim, input_file_state, ClimaComms.context(cs))
196199
end
197-
if !isnothing(Checkpointer.get_model_cache(sim))
200+
if !isnothing(Checkpointer.get_model_cache(sim)) && restart_cache
198201
input_file_cache = joinpath(
199202
checkpoint_dir,
200203
"checkpoint_cache_$(pid)_$(nameof(sim))_$(checkpoint_t).jld2",

0 commit comments

Comments
 (0)