Skip to content

Commit 850aa7d

Browse files
Merge pull request #3859 from CliMA/dy/auto_sparse_jacobian
Add AutoSparseJacobian algorithm for implicit solver
2 parents a7c139e + a295db6 commit 850aa7d

27 files changed

+1640
-190
lines changed

.buildkite/Manifest-v1.11.toml

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -363,16 +363,16 @@ version = "0.5.18"
363363
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
364364

365365
[[deps.ClimaAtmos]]
366-
deps = ["Adapt", "ArgParse", "Artifacts", "AtmosphericProfilesLibrary", "ClimaComms", "ClimaCore", "ClimaDiagnostics", "ClimaInterpolations", "ClimaParams", "ClimaTimeSteppers", "ClimaUtilities", "CloudMicrophysics", "Dates", "ForwardDiff", "Insolation", "Interpolations", "LazyArtifacts", "LazyBroadcast", "LinearAlgebra", "Logging", "NCDatasets", "NVTX", "NullBroadcasts", "RRTMGP", "Random", "SciMLBase", "StaticArrays", "Statistics", "SurfaceFluxes", "Thermodynamics", "UnrolledUtilities", "YAML"]
366+
deps = ["Adapt", "ArgParse", "Artifacts", "AtmosphericProfilesLibrary", "ClimaComms", "ClimaCore", "ClimaDiagnostics", "ClimaInterpolations", "ClimaParams", "ClimaTimeSteppers", "ClimaUtilities", "CloudMicrophysics", "Dates", "ForwardDiff", "Insolation", "Interpolations", "LazyArtifacts", "LazyBroadcast", "LinearAlgebra", "Logging", "NCDatasets", "NVTX", "NullBroadcasts", "RRTMGP", "Random", "SciMLBase", "SparseMatrixColorings", "StaticArrays", "Statistics", "SurfaceFluxes", "Thermodynamics", "UnrolledUtilities", "YAML"]
367367
path = ".."
368368
uuid = "b2c96348-7fb7-4fe0-8da9-78d88439e717"
369369
version = "0.31.0"
370370

371371
[[deps.ClimaComms]]
372372
deps = ["Adapt", "Logging", "LoggingExtras"]
373-
git-tree-sha1 = "75b9d1a3b4e3efa2cbbae2eb7b52f14c0b38ccf0"
373+
git-tree-sha1 = "f3961fa943c1bbbc376af910cb98db67084dc642"
374374
uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d"
375-
version = "0.6.8"
375+
version = "0.6.9"
376376
weakdeps = ["CUDA", "MPI"]
377377

378378
[deps.ClimaComms.extensions]
@@ -2318,6 +2318,20 @@ deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
23182318
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
23192319
version = "1.11.0"
23202320

2321+
[[deps.SparseMatrixColorings]]
2322+
deps = ["ADTypes", "DocStringExtensions", "LinearAlgebra", "PrecompileTools", "Random", "SparseArrays"]
2323+
git-tree-sha1 = "ab958b4fec46d1f1d057bb8e2a99bfdb90744646"
2324+
uuid = "0a514795-09f3-496d-8182-132a7b665d35"
2325+
version = "0.4.20"
2326+
2327+
[deps.SparseMatrixColorings.extensions]
2328+
SparseMatrixColoringsCliqueTreesExt = "CliqueTrees"
2329+
SparseMatrixColoringsColorsExt = "Colors"
2330+
2331+
[deps.SparseMatrixColorings.weakdeps]
2332+
CliqueTrees = "60701a23-6482-424a-84db-faee86b9b1f8"
2333+
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
2334+
23212335
[[deps.SpecialFunctions]]
23222336
deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
23232337
git-tree-sha1 = "41852b8679f78c8d8961eeadc8f62cef861a52e3"

.buildkite/ci_driver.jl

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,21 @@ using Test
3939
import Tar
4040
import Base.Filesystem: rm
4141
import Statistics: mean
42-
import LinearAlgebra: norm_sqr
42+
import LinearAlgebra: norm_sqr, diag, UniformScaling
4343
include(joinpath(pkgdir(CA), "post_processing", "ci_plots.jl"))
4444

4545
ref_job_id = config.parsed_args["reference_job_id"]
4646
reference_job_id = isnothing(ref_job_id) ? simulation.job_id : ref_job_id
4747

48+
if (
49+
config.parsed_args["debug_jacobian"] &&
50+
!config.parsed_args["use_dense_jacobian"]
51+
)
52+
@info "Debugging Jacobian in first column of final state"
53+
include(joinpath(@__DIR__, "..", "post_processing", "jacobian_summary.jl"))
54+
print_jacobian_summary(integrator)
55+
end
56+
4857
if sol_res.ret_code == :simulation_crashed
4958
error(
5059
"The ClimaAtmos simulation has crashed. See the stack trace for details.",

.buildkite/pipeline.yml

Lines changed: 110 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,13 @@ steps:
292292
--job_id baroclinic_wave_equil_conservation_ft64
293293
artifact_paths: "baroclinic_wave_equil_conservation_ft64/output_active/*"
294294

295+
- label: ":computer: baroclinic wave moist check conservation float64 sparse autodiff"
296+
command: >
297+
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_equil_conservation_ft64_sparse_autodiff.yml
298+
--job_id baroclinic_wave_equil_conservation_ft64_sparse_autodiff
299+
artifact_paths: "baroclinic_wave_equil_conservation_ft64_sparse_autodiff/output_active/*"
300+
soft_fail: true
301+
295302
- label: ":computer: baroclinic wave moist check conservation with sources"
296303
command: >
297304
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_equil_conservation_source.yml
@@ -328,6 +335,14 @@ steps:
328335
--job_id baroclinic_wave_dense_autodiff
329336
artifact_paths: "baroclinic_wave_dense_autodiff/output_active/*"
330337

338+
- label: ":computer: baroclinic wave sparse autodiff"
339+
key: baroclinic_wave_sparse_autodiff
340+
command: >
341+
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
342+
--config_file $CONFIG_PATH/baroclinic_wave_sparse_autodiff.yml
343+
--job_id baroclinic_wave_sparse_autodiff
344+
artifact_paths: "baroclinic_wave_sparse_autodiff/output_active/*"
345+
331346
- label: ":computer: no lim baroclinic wave equilmoist"
332347
command: >
333348
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
@@ -379,6 +394,26 @@ steps:
379394
slurm_mem: 20GB
380395
slurm_constraint: icelake|cascadelake|skylake|epyc
381396

397+
- label: ":umbrella: aquaplanet nonequil allsky monin_obukhov varying insol gravity wave (gfdl_restart) high top 1-moment dense autodiff"
398+
command: >
399+
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
400+
--config_file $CONFIG_PATH/aquaplanet_nonequil_allsky_gw_res_dense_autodiff.yml
401+
--job_id aquaplanet_nonequil_allsky_gw_res_dense_autodiff
402+
artifact_paths: "aquaplanet_nonequil_allsky_gw_res_dense_autodiff/output_active/*"
403+
agents:
404+
slurm_mem: 20GB
405+
slurm_constraint: icelake|cascadelake|skylake|epyc
406+
407+
- label: ":umbrella: aquaplanet nonequil allsky monin_obukhov varying insol gravity wave (gfdl_restart) high top 1-moment sparse autodiff"
408+
command: >
409+
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
410+
--config_file $CONFIG_PATH/aquaplanet_nonequil_allsky_gw_res_sparse_autodiff.yml
411+
--job_id aquaplanet_nonequil_allsky_gw_res_sparse_autodiff
412+
artifact_paths: "aquaplanet_nonequil_allsky_gw_res_sparse_autodiff/output_active/*"
413+
agents:
414+
slurm_mem: 20GB
415+
slurm_constraint: icelake|cascadelake|skylake|epyc
416+
382417
- label: ":umbrella: aquaplanet nonequil allsky monin_obukhov varying insol gravity wave (gfdl_restart) high top 2-moment"
383418
command: >
384419
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
@@ -393,16 +428,6 @@ steps:
393428
slurm_mem: 20GB
394429
slurm_constraint: icelake|cascadelake|skylake|epyc
395430

396-
- label: ":umbrella: aquaplanet nonequil allsky monin_obukhov varying insol gravity wave (gfdl_restart) high top 1-moment dense autodiff"
397-
command: >
398-
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
399-
--config_file $CONFIG_PATH/aquaplanet_nonequil_allsky_gw_res_dense_autodiff.yml
400-
--job_id aquaplanet_nonequil_allsky_gw_res_dense_autodiff
401-
artifact_paths: "aquaplanet_nonequil_allsky_gw_res_dense_autodiff/output_active/*"
402-
agents:
403-
slurm_mem: 20GB
404-
slurm_constraint: icelake|cascadelake|skylake|epyc
405-
406431
- label: ":computer: aquaplanet equil allsky monin_obukhov varying insol gravity wave (raw_topo) high top zonally asymmetric"
407432
command: >
408433
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
@@ -633,6 +658,15 @@ steps:
633658
agents:
634659
slurm_mem: 20GB
635660

661+
- label: ":man_in_business_suit_levitating: AMIP Target EDOnly nonequil sparse autodiff"
662+
command: >
663+
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
664+
--config_file $CONFIG_PATH/amip_target_edonly_nonequil_sparse_autodiff.yml
665+
--job_id amip_target_edonly_nonequil_sparse_autodiff
666+
artifact_paths: "amip_target_edonly_nonequil_sparse_autodiff/output_active/*"
667+
agents:
668+
slurm_mem: 40GB
669+
636670
- group: "Diagnostic EDMFX"
637671
steps:
638672

@@ -759,6 +793,16 @@ steps:
759793
slurm_mem: 20GB
760794
slurm_constraint: icelake|cascadelake|skylake|epyc
761795

796+
- label: ":genie: Diagnostic EDMFX aquaplanet sparse autodiff"
797+
command: >
798+
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
799+
--config_file $CONFIG_PATH/diagnostic_edmfx_aquaplanet_sparse_autodiff.yml
800+
--job_id diagnostic_edmfx_aquaplanet_sparse_autodiff
801+
artifact_paths: "diagnostic_edmfx_aquaplanet_sparse_autodiff/output_active/*"
802+
agents:
803+
slurm_mem: 20GB
804+
slurm_constraint: icelake|cascadelake|skylake|epyc
805+
762806
- group: "Prognostic EDMFX"
763807
steps:
764808

@@ -943,6 +987,16 @@ steps:
943987
slurm_mem: 20GB
944988
slurm_constraint: icelake|cascadelake|skylake|epyc
945989

990+
- label: ":genie: Prognostic EDMFX aquaplanet sparse autodiff"
991+
command: >
992+
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
993+
--config_file $CONFIG_PATH/prognostic_edmfx_aquaplanet_sparse_autodiff.yml
994+
--job_id prognostic_edmfx_aquaplanet_sparse_autodiff
995+
artifact_paths: "prognostic_edmfx_aquaplanet_sparse_autodiff/output_active/*"
996+
agents:
997+
slurm_mem: 20GB
998+
slurm_constraint: icelake|cascadelake|skylake|epyc
999+
9461000
- group: "GPU"
9471001
steps:
9481002

@@ -984,6 +1038,19 @@ steps:
9841038
slurm_gpus: 1
9851039
slurm_mem: 16GB
9861040

1041+
- label: "GPU: baroclinic wave sparse autodiff"
1042+
key: "baroclinic_wave_gpu_sparse_autodiff"
1043+
command: >
1044+
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
1045+
--config_file $CONFIG_PATH/baroclinic_wave_sparse_autodiff.yml
1046+
--job_id baroclinic_wave_gpu_sparse_autodiff
1047+
artifact_paths: "baroclinic_wave_gpu_sparse_autodiff/output_active/*"
1048+
env:
1049+
CLIMACOMMS_DEVICE: "CUDA"
1050+
agents:
1051+
slurm_gpus: 1
1052+
slurm_mem: 16GB
1053+
9871054
- label: "GPU: compare BW with CPU"
9881055
command: >
9891056
tar xvf baroclinic_wave/output_active/hdf5_files.tar -C baroclinic_wave
@@ -1085,6 +1152,30 @@ steps:
10851152
slurm_gpus: 1
10861153
slurm_mem: 20GB
10871154

1155+
- label: "GPU: Prognostic EDMFX aquaplanet dense autodiff"
1156+
command: >
1157+
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
1158+
--config_file $CONFIG_PATH/prognostic_edmfx_aquaplanet_gpu_dense_autodiff.yml
1159+
--job_id prognostic_edmfx_aquaplanet_gpu_dense_autodiff
1160+
artifact_paths: "prognostic_edmfx_aquaplanet_gpu_dense_autodiff/output_active/*"
1161+
env:
1162+
CLIMACOMMS_DEVICE: "CUDA"
1163+
agents:
1164+
slurm_gpus: 1
1165+
slurm_mem: 20GB
1166+
1167+
- label: "GPU: Prognostic EDMFX aquaplanet sparse autodiff"
1168+
command: >
1169+
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
1170+
--config_file $CONFIG_PATH/prognostic_edmfx_aquaplanet_gpu_sparse_autodiff.yml
1171+
--job_id prognostic_edmfx_aquaplanet_gpu_sparse_autodiff
1172+
artifact_paths: "prognostic_edmfx_aquaplanet_gpu_sparse_autodiff/output_active/*"
1173+
env:
1174+
CLIMACOMMS_DEVICE: "CUDA"
1175+
agents:
1176+
slurm_gpus: 1
1177+
slurm_mem: 20GB
1178+
10881179
- group: "Benchmarks"
10891180
steps:
10901181

@@ -1218,6 +1309,15 @@ steps:
12181309
agents:
12191310
slurm_mem: 32GB
12201311

1312+
- label: ":fire: Flame graph: prognostic edmf sparse autodiff"
1313+
command: >
1314+
julia --color=yes --project=.buildkite perf/flame.jl
1315+
--config_file $PERF_CONFIG_PATH/bm_aquaplanet_progedmf_sparse_autodiff.yml
1316+
--job_id flame_aquaplanet_progedmf_sparse_autodiff
1317+
artifact_paths: "flame_aquaplanet_progedmf_sparse_autodiff/*"
1318+
agents:
1319+
slurm_mem: 32GB
1320+
12211321
- label: ":fire: Flame graph: diffusion"
12221322
command: >
12231323
julia --color=yes --project=.buildkite perf/flame.jl

Project.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ NullBroadcasts = "0d71be07-595a-4f89-9529-4065a4ab43a6"
3030
RRTMGP = "a01a1ee8-cea4-48fc-987c-fc7878d79da1"
3131
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
3232
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
33+
SparseMatrixColorings = "0a514795-09f3-496d-8182-132a7b665d35"
3334
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
3435
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
3536
SurfaceFluxes = "49b00bb7-8bd4-4f2b-b78c-51cd0450215f"
@@ -42,7 +43,7 @@ Adapt = "4"
4243
ArgParse = "1"
4344
Artifacts = "1"
4445
AtmosphericProfilesLibrary = "0.1.7"
45-
ClimaComms = "0.6.8"
46+
ClimaComms = "0.6.9"
4647
ClimaCore = "0.14.37"
4748
ClimaDiagnostics = "0.2.12"
4849
ClimaInterpolations = "0.1.0"
@@ -64,6 +65,7 @@ NullBroadcasts = "0.1"
6465
RRTMGP = "0.21.3"
6566
Random = "1"
6667
SciMLBase = "2.12"
68+
SparseMatrixColorings = "0.4.20"
6769
StaticArrays = "1.9"
6870
Statistics = "1"
6971
SurfaceFluxes = "0.11, 0.12"

config/default_configs/default_config.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,18 @@ jvp_step_adjustment:
8989
use_dense_jacobian:
9090
help: "Whether to use a dense Jacobian matrix that is computed using forward-mode automatic differentiation and inverted using LU factorization [`true`, `false` (default)]"
9191
value: false
92+
use_auto_jacobian:
93+
help: "Whether to populate the entries of the sparse Jacobian matrix using forward-mode automatic differentiation with sparse matrix coloring (only used when `use_dense_jacobian` is `false`) [`true`, `false` (default)]"
94+
value: false
95+
auto_jacobian_padding_bands:
96+
help: "Target number of bands to add in every block of the sparse Jacobian matrix, eliminating errors from Jacobian entries that lie outside of the default sparsity pattern; when unspecified, each block gets a predetermined number of padding bands based on the typical magnitudes of its entries (only used when `use_auto_jacobian` is `true`)"
97+
value: ~
9298
update_jacobian_every:
9399
help: "Frequency at which the Jacobian matrix should be updated (once per timestep, once per timestepper stage, or once per linear solve) [`dt`, `stage`, `solve` (default)]"
94100
value: "solve"
101+
debug_jacobian:
102+
help: "Whether to print summary information about the Jacobian matrix, including comparisons of different Jacobian algorithms against the exact Jacobian, all evaluated on the first column of the final state [`true`, `false` (default)]"
103+
value: false
95104
# Radiation
96105
rad:
97106
help: "Radiation model [`nothing` (default), `gray`, `clearsky`, `allsky`, `allskywithclear`]"
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
h_elem: 16
2+
z_max: 60000.0
3+
z_elem: 63
4+
dz_bottom: 30.0
5+
rayleigh_sponge: true
6+
viscous_sponge: true
7+
dt_save_state_to_disk: "30days"
8+
cloud_model: "grid_scale"
9+
moist: "nonequil"
10+
implicit_noneq_cloud_formation: true
11+
use_auto_jacobian: true
12+
update_jacobian_every: dt
13+
debug_jacobian: true
14+
precip_model: "1M"
15+
rad: "allskywithclear"
16+
dt_rad: "1hours"
17+
dt_cloud_fraction: "1hours"
18+
insolation: "timevarying"
19+
co2_model: maunaloa
20+
prescribe_ozone: true
21+
aerosol_radiation: true
22+
edmfx_sgs_diffusive_flux: true
23+
prescribed_aerosols: ["CB1", "CB2", "DST01", "DST02", "DST03", "DST04", "DST05", "OC1", "OC2", "SO4", "SSLT01", "SSLT02", "SSLT03", "SSLT04", "SSLT05"]
24+
surface_setup: "DefaultMoninObukhov"
25+
turbconv: "edonly_edmfx"
26+
implicit_diffusion: true
27+
approximate_linear_solve_iters: 2
28+
dt: "120secs"
29+
t_end: "10mins"
30+
toml: [toml/short_aquaplanet_nonequil.toml]
31+
diagnostics:
32+
- short_name: [ta, ua, wa, va, rhoa, hur, hus, hussn, husra, clw, cli]
33+
period: 1hours
34+
netcdf_output_at_levels: true
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
z_max: 60000.0
2+
z_elem: 31
3+
dz_bottom: 50.0
4+
rayleigh_sponge: true
5+
dt: "360secs"
6+
t_end: "6hours" # This takes 2 hours to run in CI when t_end is 1 day.
7+
dt_save_state_to_disk: "6hours"
8+
vert_diff: "DecayWithHeightDiffusion"
9+
implicit_diffusion: true
10+
use_auto_jacobian: true
11+
update_jacobian_every: dt
12+
debug_jacobian: true
13+
moist: "nonequil"
14+
precip_model: "1M"
15+
rad: "allskywithclear"
16+
aerosol_radiation: true
17+
prescribe_clouds_in_radiation: true
18+
radiation_reset_rng_seed: true
19+
insolation: "timevarying"
20+
non_orographic_gravity_wave: true
21+
orographic_gravity_wave: "gfdl_restart"
22+
surface_setup: "DefaultMoninObukhov"
23+
prescribe_ozone: true
24+
prescribed_aerosols: ["SO4", "CB1", "OC1", "DST01", "SSLT01"]
25+
toml: [toml/sphere_aquaplanet_1M.toml]
26+
diagnostics:
27+
- short_name: [edt, evu, mmrso4, mmrbcpo, mmrocpo, mmrdust, mmrss, loadss, o3, od550aer, odsc550aer]
28+
reduction_time: average
29+
period: 6hours
30+
- short_name: [reffclw, reffcli]
31+
period: 6hours
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
FLOAT_TYPE: "Float64"
2+
initial_condition: "MoistBaroclinicWave"
3+
dt: "400secs"
4+
t_end: "5days"
5+
moist: "equil"
6+
disable_surface_flux_tendency: true
7+
use_auto_jacobian: true
8+
update_jacobian_every: dt
9+
debug_jacobian: true
10+
dt_save_state_to_disk: "5days"
11+
check_conservation: true
12+
diagnostics:
13+
- short_name: [massa, energya]
14+
period: 1days
15+
writer: dict
16+
use_itime: true
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
dt_save_state_to_disk: "2days"
2+
initial_condition: "DryBaroclinicWave"
3+
use_auto_jacobian: true
4+
update_jacobian_every: dt
5+
debug_jacobian: true
6+
dt: "400secs"
7+
t_end: "10days"
8+
disable_surface_flux_tendency: true
9+
diagnostics:
10+
- short_name: [pfull, ua, wa, va, rv, ta, ke]
11+
period: 1days
12+
- short_name: [pfull, ua, wa, va, rv, ta, ke]
13+
period: 1days
14+
writer: h5

0 commit comments

Comments
 (0)