diff --git a/.buildkite/Manifest-v1.11.toml b/.buildkite/Manifest-v1.11.toml index ab2e287997..50a697abfc 100644 --- a/.buildkite/Manifest-v1.11.toml +++ b/.buildkite/Manifest-v1.11.toml @@ -1,6 +1,6 @@ # This file is machine-generated - editing it directly is not advised -julia_version = "1.11.6" +julia_version = "1.11.7" manifest_format = "2.0" project_hash = "12784bf0b21aee2b4075f2bb64383303b89e8560" @@ -406,7 +406,9 @@ weakdeps = ["CUDA", "MPI"] [[deps.ClimaCore]] deps = ["Adapt", "BandedMatrices", "BlockArrays", "ClimaComms", "CubedSphere", "DataStructures", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LazyBroadcast", "LinearAlgebra", "MultiBroadcastFusion", "NVTX", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "StaticArrays", "Statistics", "UnrolledUtilities"] -git-tree-sha1 = "344711aa776e0bbd007ad127e5ba9f2113a1c88b" +git-tree-sha1 = "288de6398d559c7b544258566c0d7bb29575a456" +repo-rev = "cec2ef73a8412db82d146b3b9d4591588d589c67" +repo-url = "https://github.com/CliMA/ClimaCore.jl" uuid = "d414da3d-4745-48bb-8d80-42e94e092884" version = "0.14.41" weakdeps = ["CUDA", "Krylov"] diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index bfd4ab5a30..aabccec630 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -20,7 +20,7 @@ steps: - label: "init :computer:" key: "init_cpu_env" concurrency: 1 - concurrency_group: 'depot/climaatmos-ci' + concurrency_group: "depot/climaatmos-ci" command: - "echo $$JULIA_DEPOT_PATH" @@ -41,13 +41,11 @@ steps: - group: "Reproducibility infrastructure" steps: - - label: ":computer: Test reproducibility infrastructure" command: "julia --color=yes --project=.buildkite test/unit_reproducibility_infra.jl" - group: "Radiation" steps: - - label: ":computer: single column radiative equilibrium gray" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -98,7 +96,6 @@ steps: - group: "Gravity wave" steps: 
- - label: ":computer: non-orographic gravity wave parameterization unit test 3d" command: "julia --color=yes --project=.buildkite test/parameterized_tendencies/gravity_wave/non_orographic_gravity_wave/nogw_test_3d.jl" artifact_paths: "nonorographic_gravity_wave_test_3d/*" @@ -132,7 +129,6 @@ steps: - group: "Column Examples" steps: - - label: ":computer: single column hydrostatic balance float64" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -149,7 +145,6 @@ steps: - group: "Box Examples" steps: - - label: ":computer: Box hydrostatic balance" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -188,7 +183,6 @@ steps: - group: "Plane Examples" steps: - - label: ":computer: Density current experiment" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -287,7 +281,6 @@ steps: - group: "Conservation check" steps: - - label: ":computer: baroclinic wave check conservation" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_conservation.yml @@ -324,7 +317,6 @@ steps: - group: "Sphere Examples (Dycore)" steps: - - label: ":computer: hydrostatic balance float64" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -394,7 +386,6 @@ steps: - group: "Sphere Examples (Aquaplanet)" steps: - - label: ":umbrella: aquaplanet nonequil allsky monin_obukhov varying insol gravity wave (gfdl_restart) high top 1-moment" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -462,7 +453,6 @@ steps: - group: "Sphere Examples (Topography)" steps: - - label: ":computer: baroclinic wave topography (dcmip)" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -489,7 +479,6 @@ steps: - group: "Restarting" steps: - - label: ":computer: test restart" command: > julia --color=yes --project=.buildkite test/restart.jl @@ -554,7 +543,6 @@ steps: - group: "MPI Examples" steps: - - 
label: ":computer: Prep restart for MPI" key: "mpi_baro_wave_make_restart" command: > @@ -624,7 +612,6 @@ steps: - group: "EDOnlyEDMFX" steps: - - label: ":man_in_business_suit_levitating: EDOnly EDMFX aquaplanet" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -647,7 +634,6 @@ steps: - group: "Diagnostic EDMFX" steps: - - label: ":genie: Diagnostic EDMFX test in a box" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -764,14 +750,12 @@ steps: - group: "Prognostic EDMFX" steps: - - label: ":genie: Prognostic EDMFX advection test in a column" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/prognostic_edmfx_adv_test_column.yml --job_id prognostic_edmfx_adv_test_column artifact_paths: "prognostic_edmfx_adv_test_column/output_active/*" - agents: slurm_mem: 20GB @@ -975,7 +959,6 @@ steps: - group: "Autodiff" steps: - - label: "baroclinic wave moist check conservation float64 sparse autodiff" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl --config_file $CONFIG_PATH/baroclinic_wave_equil_conservation_ft64_sparse_autodiff.yml @@ -1071,7 +1054,6 @@ steps: - group: "GPU" steps: - - label: "GPU: Gravity waves" command: > julia --color=yes --project=.buildkite .buildkite/ci_driver.jl @@ -1140,7 +1122,6 @@ steps: - "baroclinic_wave" - "baroclinic_wave_gpu" - - label: "GPU: baroclinic wave - 2 gpus" key: "baroclinic_wave_2gpu" command: @@ -1260,7 +1241,6 @@ steps: - group: "Benchmarks" steps: - - label: ":computer: Benchmark: CPU baroclinic wave moist" command: > julia --color=yes --project=.buildkite perf/benchmark_step.jl @@ -1278,6 +1258,7 @@ steps: artifact_paths: "bm_baroclinic_wave_moist_gpu/output_active/*" env: CLIMACOMMS_DEVICE: "CUDA" + CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" agents: slurm_mem: 16G slurm_gpus: 1 @@ -1299,6 +1280,7 @@ steps: --job_id bm_default_gpu env: CLIMACOMMS_DEVICE: "CUDA" + 
CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" agents: slurm_mem: 24GB slurm_gpus: 1 @@ -1311,6 +1293,7 @@ steps: --job_id bm_diag_edmf_gpu env: CLIMACOMMS_DEVICE: "CUDA" + CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" agents: slurm_mem: 24GB slurm_gpus: 1 @@ -1323,13 +1306,13 @@ steps: --job_id bm_prog_edmf_gpu env: CLIMACOMMS_DEVICE: "CUDA" + CLIMA_NAME_CUDA_KERNELS_FROM_STACK_TRACE: "true" agents: slurm_mem: 24GB slurm_gpus: 1 - group: "Flame graphs" steps: - - label: ":fire: Flame graph: gpu job" command: > julia --color=yes --project=.buildkite perf/flame.jl @@ -1447,7 +1430,6 @@ steps: - group: "Checkbounds/Inference/Invalidations" steps: - # TODO: we should somehow decouple this unit test from the perf env / scripts # Checkbounds - label: ":computer: checkbounds" diff --git a/perf/benchmark_step.jl b/perf/benchmark_step.jl index b5d717e9e5..99891c7295 100644 --- a/perf/benchmark_step.jl +++ b/perf/benchmark_step.jl @@ -18,6 +18,7 @@ import Random Random.seed!(1234) import ClimaAtmos as CA import ClimaComms +import CUDA include("common.jl") (; config_file, job_id) = CA.commandline_kwargs() @@ -26,17 +27,44 @@ config = CA.AtmosConfig(config_file; job_id) simulation = CA.get_simulation(config) (; integrator) = simulation; Y₀ = deepcopy(integrator.u); +# Run one step to compile @info "Compiling benchmark_step!..." -CA.benchmark_step!(integrator, Y₀); # compile first +CA.benchmark_step!(integrator, Y₀); @info "Running benchmark_step!..." 
-n_steps = 10 comms_ctx = ClimaComms.context(integrator.u.c) device = ClimaComms.device(comms_ctx) -local e -s = CA.@timed_str begin - e = ClimaComms.elapsed(device) do - CA.benchmark_step!(integrator, Y₀, n_steps) # run + +# If we're running on CUDA, use CUDA's profiler +if device isa ClimaComms.CUDADevice + e = 0.0 + n_steps = 5 + use_external_profiler = CUDA.Profile.detect_cupti() + if use_external_profiler + @info "Using external CUDA profiler" + CUDA.@profile external = true begin + e = CUDA.@elapsed begin + CA.benchmark_step!(integrator, Y₀, n_steps) + end + end + else + @info "Using internal CUDA profiler" + res = CUDA.@profile external = false begin + e = CUDA.@elapsed begin + CA.benchmark_step!(integrator, Y₀, n_steps) + end + end + show(IOContext(stdout, :limit => false), res) + end + @info "Ran step! with CUDA $n_steps times in $e s, ($(CA.prettytime(e/n_steps*1e9)) per step)" +else + # Otherwise, time the steps with ClimaComms.elapsed + n_steps = 10 + local e + s = CA.@timed_str begin + e = ClimaComms.elapsed(device) do + CA.benchmark_step!(integrator, Y₀, n_steps) # run + end end + @info "Ran step! $n_steps times in $s, ($(CA.prettytime(e/n_steps*1e9)) per step)" end -@info "Ran step! $n_steps times in $s, ($(CA.prettytime(e/n_steps*1e9)) per step)"