Skip to content
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
abe1f57
Update benchmark_step.jl for CUDA profiling
petebachant Oct 6, 2025
cbad8ac
Fix external profiler determination
petebachant Oct 6, 2025
df5f349
Get kernel naming option from ClimaCore
petebachant Oct 7, 2025
606f584
Control kernel naming via env var
petebachant Oct 14, 2025
f369d6c
Use dev version of ClimaCore
petebachant Oct 14, 2025
e4ce7a2
Short-circuit GPU benchmark based on device
petebachant Oct 14, 2025
a521070
Rename kernels in buildkite
petebachant Oct 14, 2025
04e0454
Autoformat .buildkite/pipeline.yml
petebachant Oct 14, 2025
8a931f5
Improve logging
petebachant Oct 14, 2025
215255f
Always import CUDA
petebachant Oct 17, 2025
0b362e7
Name kernels from stack trace in benchmark GPU default
petebachant Oct 17, 2025
cba2694
Merge branch 'main' of https://github.com/CliMA/ClimaAtmos.jl into pb…
petebachant Oct 17, 2025
6620982
Set stacktrace-based kernel names before compiling
petebachant Oct 20, 2025
3dcca85
Print internal profiling result in benchmark_step.jl
petebachant Oct 20, 2025
b228a35
Relocate function so it can be called
petebachant Oct 20, 2025
4439d43
Update ClimaCore dev dep
petebachant Oct 20, 2025
f36f405
Update ClimaCore
petebachant Oct 20, 2025
a09bbc0
Trigger build
petebachant Oct 20, 2025
317dca8
Merge main
petebachant Oct 21, 2025
1153ca3
Fix url
petebachant Oct 21, 2025
7aa7dea
Widen display size for CUDA profiling results
petebachant Oct 21, 2025
f4429ff
Narrow print
petebachant Oct 21, 2025
57fba6c
Remove limit
petebachant Oct 24, 2025
b073672
Update ClimaCore
petebachant Oct 24, 2025
3824cde
Merge branch 'main' of github.com:CliMA/ClimaAtmos.jl into pb/gpu-perf-2
petebachant Oct 24, 2025
cb6bc31
Update ClimaCore
petebachant Oct 24, 2025
e41ea59
Merge main
petebachant Oct 24, 2025
efb94cf
Update ClimaCore
petebachant Oct 24, 2025
ac635a4
Merge branch 'main' of https://github.com/CliMA/ClimaAtmos.jl into pb…
petebachant Oct 27, 2025
90ccf5b
Move CUDA kernel naming selection via env var into ClimaCore
petebachant Oct 27, 2025
63e213d
Update ClimaCore
petebachant Oct 27, 2025
512471e
Set kernel naming from stack trace enabled for entire buildkite pipeline
petebachant Oct 27, 2025
175164c
Update ClimaCore
petebachant Oct 27, 2025
7dd4645
Update ClimaCore
petebachant Nov 3, 2025
88cc058
Merge branch 'main' of https://github.com/CliMA/ClimaAtmos.jl into pb…
petebachant Nov 3, 2025
7e959cd
Update ClimaCore
petebachant Nov 4, 2025
77ef5e8
Update ClimaCore and only rename kernels in specific benchmarks
petebachant Nov 5, 2025
d7befca
Update ClimaAtmos
petebachant Nov 5, 2025
5559604
Update ClimaCore
petebachant Nov 5, 2025
adde261
Update comment
petebachant Nov 5, 2025
427b8db
Update comment
petebachant Nov 6, 2025
ad20c71
Revert "Update comment"
petebachant Nov 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .buildkite/Manifest-v1.11.toml
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,9 @@ weakdeps = ["CUDA", "MPI"]

[[deps.ClimaCore]]
deps = ["Adapt", "BandedMatrices", "BlockArrays", "ClimaComms", "CubedSphere", "DataStructures", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LazyBroadcast", "LinearAlgebra", "MultiBroadcastFusion", "NVTX", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "StaticArrays", "Statistics", "UnrolledUtilities"]
git-tree-sha1 = "4fdb73dfcf8dc1c9655c81c605989da89b18fdd1"
git-tree-sha1 = "6f1589a275831b575f103e82ceb2a4adee077b3f"
repo-rev = "cf830bf6263da935a3d34f4081f0a74a2f8378d5"
repo-url = "https://github.com/CliMA/ClimaCore.jl"
uuid = "d414da3d-4745-48bb-8d80-42e94e092884"
version = "0.14.40"
weakdeps = ["CUDA", "Krylov"]
Expand Down
35 changes: 7 additions & 28 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ steps:
- label: "init :computer:"
key: "init_cpu_env"
concurrency: 1
concurrency_group: 'depot/climaatmos-ci'
concurrency_group: "depot/climaatmos-ci"
command:
- "echo $$JULIA_DEPOT_PATH"

Expand All @@ -41,13 +41,11 @@ steps:

- group: "Reproducibility infrastructure"
steps:

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes were made by a YAML auto-formatter in VS Code. Is there a style guide I might be breaking here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure... this is something I have been wondering as well. I considered following this example, which is used in Buildkite's docs.

- label: ":computer: Test reproducibility infrastructure"
command: "julia --color=yes --project=.buildkite test/unit_reproducibility_infra.jl"

- group: "Radiation"
steps:

- label: ":computer: single column radiative equilibrium gray"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -98,7 +96,6 @@ steps:

- group: "Gravity wave"
steps:

- label: ":computer: non-orographic gravity wave parameterization unit test 3d"
command: "julia --color=yes --project=.buildkite test/parameterized_tendencies/gravity_wave/non_orographic_gravity_wave/nogw_test_3d.jl"
artifact_paths: "nonorographic_gravity_wave_test_3d/*"
Expand Down Expand Up @@ -132,7 +129,6 @@ steps:

- group: "Column Examples"
steps:

- label: ":computer: single column hydrostatic balance float64"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand All @@ -149,7 +145,6 @@ steps:

- group: "Box Examples"
steps:

- label: ":computer: Box hydrostatic balance"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -188,7 +183,6 @@ steps:

- group: "Plane Examples"
steps:

- label: ":computer: Density current experiment"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -279,7 +273,6 @@ steps:

- group: "Conservation check"
steps:

- label: ":computer: baroclinic wave check conservation"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_conservation.yml
Expand All @@ -305,7 +298,7 @@ steps:
artifact_paths: "baroclinic_wave_equil_conservation_source/output_active/*"
agents:
slurm_mem: 16GB

- label: ":computer: baroclinic wave nonequil moist check conservation with sources"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_nonequil_conservation_source.yml
Expand All @@ -316,7 +309,6 @@ steps:

- group: "Sphere Examples (Dycore)"
steps:

- label: ":computer: hydrostatic balance float64"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -374,7 +366,6 @@ steps:

- group: "Sphere Examples (Aquaplanet)"
steps:

- label: ":umbrella: aquaplanet nonequil allsky monin_obukhov varying insol gravity wave (gfdl_restart) high top 1-moment"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -442,7 +433,6 @@ steps:

- group: "Sphere Examples (Topography)"
steps:

- label: ":computer: baroclinic wave topography (dcmip)"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand All @@ -469,7 +459,6 @@ steps:

- group: "Restarting"
steps:

- label: ":computer: test restart"
command: >
julia --color=yes --project=.buildkite test/restart.jl
Expand Down Expand Up @@ -534,7 +523,6 @@ steps:

- group: "MPI Examples"
steps:

- label: ":computer: Prep restart for MPI"
key: "mpi_baro_wave_make_restart"
command: >
Expand Down Expand Up @@ -604,7 +592,6 @@ steps:

- group: "EDOnlyEDMFX"
steps:

- label: ":man_in_business_suit_levitating: EDOnly EDMFX aquaplanet"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand All @@ -627,7 +614,6 @@ steps:

- group: "Diagnostic EDMFX"
steps:

- label: ":genie: Diagnostic EDMFX test in a box"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -744,14 +730,12 @@ steps:

- group: "Prognostic EDMFX"
steps:

- label: ":genie: Prognostic EDMFX advection test in a column"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file $CONFIG_PATH/prognostic_edmfx_adv_test_column.yml
--job_id prognostic_edmfx_adv_test_column
artifact_paths: "prognostic_edmfx_adv_test_column/output_active/*"

agents:
slurm_mem: 20GB

Expand Down Expand Up @@ -853,7 +837,7 @@ steps:
artifact_paths: "prognostic_edmfx_rico_column/output_active/*"
agents:
slurm_mem: 20GB

- label: ":umbrella: Prognostic EDMFX Rico in a column (implicit)"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -946,7 +930,6 @@ steps:

- group: "Autodiff"
steps:

- label: "baroclinic wave moist check conservation float64 sparse autodiff"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_equil_conservation_ft64_sparse_autodiff.yml
Expand Down Expand Up @@ -989,7 +972,7 @@ steps:
artifact_paths: "prognostic_edmfx_bomex_column_sparse_autodiff/output_active/*"
agents:
slurm_mem: 20GB

- label: "Prognostic EDMFX Dycoms RF02 in a column sparse autodiff"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand All @@ -999,7 +982,7 @@ steps:
agents:
slurm_mem: 20GB
soft_fail: true

- label: "Prognostic EDMFX TRMM in a column sparse autodiff"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -1029,7 +1012,7 @@ steps:
artifact_paths: "prognostic_edmfx_aquaplanet_sparse_autodiff/output_active/*"
agents:
slurm_mem: 20GB

- label: "AMIP Target EDOnly nonequil sparse autodiff"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand All @@ -1042,7 +1025,6 @@ steps:

- group: "GPU"
steps:

- label: "GPU: Gravity waves"
command: >
julia --color=yes --project=.buildkite .buildkite/ci_driver.jl
Expand Down Expand Up @@ -1111,7 +1093,6 @@ steps:
- "baroclinic_wave"
- "baroclinic_wave_gpu"


- label: "GPU: baroclinic wave - 2 gpus"
key: "baroclinic_wave_2gpu"
command:
Expand Down Expand Up @@ -1231,7 +1212,6 @@ steps:

- group: "Benchmarks"
steps:

- label: ":computer: Benchmark: CPU baroclinic wave moist"
command: >
julia --color=yes --project=.buildkite perf/benchmark_step.jl
Expand All @@ -1249,6 +1229,7 @@ steps:
artifact_paths: "bm_baroclinic_wave_moist_gpu/output_active/*"
env:
CLIMACOMMS_DEVICE: "CUDA"
CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true"
agents:
slurm_mem: 16G
slurm_gpus: 1
Expand Down Expand Up @@ -1300,7 +1281,6 @@ steps:

- group: "Flame graphs"
steps:

- label: ":fire: Flame graph: gpu job"
command: >
julia --color=yes --project=.buildkite perf/flame.jl
Expand Down Expand Up @@ -1418,7 +1398,6 @@ steps:

- group: "Checkbounds/Inference/Invalidations"
steps:

# TODO: we should somehow decouple this unit test from the perf env / scripts
# Checkbounds
- label: ":computer: checkbounds"
Expand Down
63 changes: 57 additions & 6 deletions perf/benchmark_step.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,64 @@ Y₀ = deepcopy(integrator.u);
CA.benchmark_step!(integrator, Y₀); # compile first

@info "Running benchmark_step!..."
n_steps = 10
comms_ctx = ClimaComms.context(integrator.u.c)
device = ClimaComms.device(comms_ctx)
local e
s = CA.@timed_str begin
e = ClimaComms.elapsed(device) do
CA.benchmark_step!(integrator, Y₀, n_steps) # run

"""
    getenv_bool(var::AbstractString; default::Bool = false)

Return the environment variable `var` interpreted as a `Bool`.

The value is stripped of surrounding whitespace and compared case-insensitively:

- `"1"`, `"true"`, `"t"`, `"yes"`, `"y"`, `"on"` give `true`;
- `"0"`, `"false"`, `"f"`, `"no"`, `"n"`, `"off"` give `false`;
- any other string that parses as an `Int` gives `true` when it is non-zero.

If `var` is not set, `default` is returned silently. If it is set to something
that matches none of the above, a warning is logged and `default` is returned.
"""
function getenv_bool(var::AbstractString; default::Bool = false)
    raw = get(ENV, var, nothing)
    raw === nothing && return default

    s = lowercase(strip(raw))
    s in ("1", "true", "t", "yes", "y", "on") && return true
    s in ("0", "false", "f", "no", "n", "off") && return false

    # Fall back to an integer reading (non-zero -> true). `tryparse` returns
    # `nothing` on malformed input, so no exception handling is needed.
    n = tryparse(Int, s)
    n === nothing || return n != 0

    @warn "Unrecognized boolean env var value; using default" var val = raw default
    return default
end

# Choose how to time the benchmark based on the device in use.
# On CUDA, run the steps under CUDA's profiler; otherwise just time them.
# `get` is used instead of `ENV[...]` so that the non-CUDA path does not throw
# a `KeyError` when CLIMACOMMS_DEVICE is not set at all.
if get(ENV, "CLIMACOMMS_DEVICE", "") == "CUDA" && device isa ClimaComms.CUDADevice
    import CUDA
    # Optionally make ClimaCore name its CUDA kernels from the stack trace
    # (opt-in via CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE).
    if getenv_bool("CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE", default = false)
        import ClimaCore
        ClimaCore.DebugOnly.name_kernels_from_stack_trace() = true
    end
    e = 0.0 # overwritten by the elapsed time measured in the profiled block below
    n_steps = 5
    use_external_profiler = CUDA.Profile.detect_cupti()
    # NOTE(review): the two branches differ only in the `external` flag;
    # presumably `CUDA.@profile` needs a literal boolean there -- confirm
    # before trying to merge them.
    if use_external_profiler
        @info "Using external CUDA profiler"
        CUDA.@profile external = true begin
            e = CUDA.@elapsed begin
                CA.benchmark_step!(integrator, Y₀, n_steps)
            end
        end
    else
        @info "Using internal CUDA profiler"
        CUDA.@profile external = false begin
            e = CUDA.@elapsed begin
                CA.benchmark_step!(integrator, Y₀, n_steps)
            end
        end
    end
    @info "Ran step! with CUDA $n_steps times in $e s, ($(CA.prettytime(e/n_steps*1e9)) per step)"
else
    # No profiler here: time the steps with `CA.@timed_str` and
    # `ClimaComms.elapsed` on the current device.
    n_steps = 10
    local e
    s = CA.@timed_str begin
        e = ClimaComms.elapsed(device) do
            CA.benchmark_step!(integrator, Y₀, n_steps) # run
        end
    end
    @info "Ran step! $n_steps times in $s, ($(CA.prettytime(e/n_steps*1e9)) per step)"
end
@info "Ran step! $n_steps times in $s, ($(CA.prettytime(e/n_steps*1e9)) per step)"
Loading