From c632a54c05641269ba79108c0406e036a89942ba Mon Sep 17 00:00:00 2001 From: Taimoor Sohail Date: Mon, 13 Oct 2025 14:47:43 +1100 Subject: [PATCH 01/15] add distributed models test --- test/runtests.jl | 1 + test/test_distributed_models.jl | 59 +++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 test/test_distributed_models.jl diff --git a/test/runtests.jl b/test/runtests.jl index ebbeaeb7c..0b77a1eef 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -111,6 +111,7 @@ end if test_group == :distributed || test_group == :all include("test_distributed_utils.jl") + include("test_distributed_models.jl") end if test_group == :reactant || test_group == :all diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl new file mode 100644 index 000000000..6a7947513 --- /dev/null +++ b/test/test_distributed_models.jl @@ -0,0 +1,59 @@ +include("runtests_setup.jl") + +using MPI + +MPI.Init() +atexit(MPI.Finalize) + +using Oceananigans.Units +using Oceananigans.DistributedComputations +using Oceananigans.Architectures: on_architecture +using Dates +using ClimaSeaIce +using ClimaSeaIce.SeaIceThermodynamics: IceWaterThermalEquilibrium + +archs = [Distributed(CPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=true), + Distributed(GPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=true)] + +function analytical_immersed_tripolar_grid(underlying_grid::TripolarGrid; radius = 5, active_cells_map = false) # degrees + λp = underlying_grid.conformal_mapping.first_pole_longitude + φp = underlying_grid.conformal_mapping.north_poles_latitude + φm = underlying_grid.conformal_mapping.southernmost_latitude + + Lz = underlying_grid.Lz + + # We need a bottom height field that ``masks'' the singularities + bottom_height(λ, φ) = ((abs(λ - λp) < radius) & (abs(φp - φ) < radius)) | + ((abs(λ - λp - 180) < radius) & (abs(φp - φ) < radius)) | (φ < φm) ? 0 : - Lz + + grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(bottom_height); active_cells_map) + + return grid +end + +@testset "Distributed Models" begin + for arch in archs + @info "Testing on architecture: $arch" + + Nx, Ny, Nz = 100, 100, 30 + underlying_grid = TripolarGrid(arch; size = (Nx, Ny, Nz), z = (-6000, 0), halo = (7, 7, 4)) + grid = analytical_immersed_tripolar_grid(underlying_grid; active_cells_map=true) + free_surface = SplitExplicitFreeSurface(grid; cfl=0.7, fixed_Δt=10minutes) + + ocean = ocean_simulation(grid; Δt=1minutes, free_surface, timestepper = :SplitRungeKutta3) + sea_ice = sea_ice_simulation(grid, ocean; advection=WENO(order=7)) + + set!(sea_ice.model, h=Metadatum(:sea_ice_thickness; dataset=ECCO4Monthly()), + ℵ=Metadatum(:sea_ice_concentration; dataset=ECCO4Monthly())) + + radiation = Radiation(arch) + atmosphere = JRA55PrescribedAtmosphere(arch; backend=JRA55NetCDFBackend(5)) + + coupled_model = OceanSeaIceModel(ocean, sea_ice; atmosphere, radiation) + + Δt=10 + simulation = Simulation(coupled_model; Δt, verbose=false, stop_time=5Δt) + + run!(simulation) + end +end From dd98f29ad35270bb06ce6e5060d31ba1d5a39141 Mon Sep 17 00:00:00 2001 From: Taimoor Sohail Date: Mon, 13 Oct 2025 14:48:25 +1100 Subject: [PATCH 02/15] temporarily deactivate distr utils tests --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 0b77a1eef..f2001d76d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -110,7 +110,7 @@ if test_group == :ocean_sea_ice_model || test_group == :all end if test_group == :distributed || test_group == :all - include("test_distributed_utils.jl") + # include("test_distributed_utils.jl") include("test_distributed_models.jl") end From 8c880930f841eb4b597dc9b9b6b41035bd86d45f Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Mon, 13 Oct 2025 18:56:41 +1100 Subject: [PATCH 03/15] Update test/test_distributed_models.jl Co-authored-by: Simone Silvestri --- test/test_distributed_models.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index 6a7947513..7efa40198 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -55,5 +55,6 @@ end simulation = Simulation(coupled_model; Δt, verbose=false, stop_time=5Δt) run!(simulation) + @test coupled_model.clock.iteration == 5 end end From 4c1c696c3dfa4fb15854e42121f249e88f81535b Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Mon, 13 Oct 2025 19:00:48 +1100 Subject: [PATCH 04/15] Uncomment inclusion of test_distributed_utils.jl --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index f2001d76d..0b77a1eef 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -110,7 +110,7 @@ if test_group == :ocean_sea_ice_model || test_group == :all end if test_group == :distributed || test_group == :all - # include("test_distributed_utils.jl") + include("test_distributed_utils.jl") include("test_distributed_models.jl") end From 18a7c75bad3a61d83c02a5f7858d62f766388502 Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Mon, 13 Oct 2025 19:01:29 +1100 Subject: [PATCH 05/15] Refactor simulation stop time calculation --- test/test_distributed_models.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index 7efa40198..c2cd54eec 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -52,9 +52,11 @@ end coupled_model = OceanSeaIceModel(ocean, sea_ice; atmosphere, radiation) Δt=10 - simulation = Simulation(coupled_model; Δt, verbose=false, stop_time=5Δt) + stop_iteration = 5 + simulation = Simulation(coupled_model; Δt, verbose=false, stop_time=stop_iteration * Δt) run!(simulation) - @test coupled_model.clock.iteration == 5 + + @test coupled_model.clock.iteration == stop_iteration end end From 03e24b9c8c23270b69dd75b0fb19e39a0d62faa5 Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Mon, 13 Oct 2025 19:03:33 +1100 Subject: [PATCH 06/15] Refactor time step variable for ocean simulation --- test/test_distributed_models.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index c2cd54eec..9450e8a2d 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -40,7 +40,9 @@ end grid = analytical_immersed_tripolar_grid(underlying_grid; active_cells_map=true) free_surface = SplitExplicitFreeSurface(grid; cfl=0.7, fixed_Δt=10minutes) - ocean = ocean_simulation(grid; Δt=1minutes, free_surface, timestepper = :SplitRungeKutta3) + Δt = 10 + + ocean = ocean_simulation(grid; Δt, free_surface, timestepper = :SplitRungeKutta3) sea_ice = sea_ice_simulation(grid, ocean; advection=WENO(order=7)) set!(sea_ice.model, h=Metadatum(:sea_ice_thickness; dataset=ECCO4Monthly()), @@ -51,7 +53,6 @@ end coupled_model = OceanSeaIceModel(ocean, sea_ice; atmosphere, radiation) - Δt=10 stop_iteration = 5 simulation = Simulation(coupled_model; Δt, verbose=false, stop_time=stop_iteration * Δt) From 0b7b672f3f880baa93ee96e32b19380a247ddfdd Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Tue, 14 Oct 2025 08:09:05 +1100 Subject: [PATCH 07/15] Refactor function definition for clarity --- test/test_distributed_models.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index 9450e8a2d..3cb0ca68e 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -15,7 +15,10 @@ using ClimaSeaIce.SeaIceThermodynamics: IceWaterThermalEquilibrium archs = [Distributed(CPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=true), Distributed(GPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=true)] -function analytical_immersed_tripolar_grid(underlying_grid::TripolarGrid; radius = 5, active_cells_map = false) # degrees +function analytical_immersed_tripolar_grid(underlying_grid::TripolarGrid; + radius = 5, # degrees + active_cells_map = false) + λp = underlying_grid.conformal_mapping.first_pole_longitude φp = underlying_grid.conformal_mapping.north_poles_latitude φm = underlying_grid.conformal_mapping.southernmost_latitude From 994f7b6cf5831bb771cbf9f94b156c5ff7c683b5 Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Wed, 15 Oct 2025 09:52:01 +1100 Subject: [PATCH 08/15] split distributed ci --- .buildkite/pipeline.yml | 15 +++++++++++++-- test/runtests.jl | 5 ++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 50c9206b6..874ba3721 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -196,10 +196,21 @@ steps: slurm_ntasks: 1 slurm_gpus_per_task: 1 - - label: "Run distributed tests" + - label: "Run distributed utils tests" key: "test_distributed" env: - TEST_GROUP: "distributed" + TEST_GROUP: "distributed_utils" + commands: + - "srun julia --project -e 'using Pkg; Pkg.test()'" + agents: + slurm_mem: 10G + slurm_cpus_per_task: 1 + slurm_ntasks: 4 + + - label: "Run distributed models tests" + key: "test_distributed" + env: + TEST_GROUP: "distributed_models" commands: - "srun julia --project -e 'using Pkg; Pkg.test()'" agents: diff --git a/test/runtests.jl b/test/runtests.jl index 0b77a1eef..cecd722a0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -109,8 +109,11 @@ if test_group == :ocean_sea_ice_model || test_group == :all include("test_diagnostics.jl") end -if test_group == :distributed || test_group == :all +if test_group == :distributed_utils || test_group == :all include("test_distributed_utils.jl") +end + +if test_group == :distributed_models || test_group == :all include("test_distributed_models.jl") end From 53fa1f115e47f8b83bddf52b01e976b074878735 Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Wed, 15 Oct 2025 10:20:08 +1100 Subject: [PATCH 09/15] fix pipeline --- .buildkite/pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 874ba3721..34225e352 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -197,7 +197,7 @@ steps: slurm_gpus_per_task: 1 - label: "Run distributed utils tests" - key: "test_distributed" + key: "test_distributed_utils" env: TEST_GROUP: "distributed_utils" commands: @@ -208,7 +208,7 @@ steps: slurm_ntasks: 4 - label: "Run distributed models tests" - key: "test_distributed" + key: "test_distributed_models" env: TEST_GROUP: "distributed_models" commands: From 0df5d2fc7ea5bf6bdd34da89cd0174d1689e5803 Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Wed, 15 Oct 2025 11:58:25 +1100 Subject: [PATCH 10/15] give slurm_gpus: 2 to distributed_models --- .buildkite/pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 34225e352..a31d13238 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -215,6 +215,7 @@ steps: - "srun julia --project -e 'using Pkg; Pkg.test()'" agents: slurm_mem: 10G + slurm_gpus: 2 slurm_cpus_per_task: 1 slurm_ntasks: 4 From bd40a2b5479d0e39526633a0e29a94434f06810e Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Wed, 15 Oct 2025 11:03:57 +0200 Subject: [PATCH 11/15] Update resource allocation in pipeline configuration --- .buildkite/pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a31d13238..0d8ba83c5 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -215,8 +215,8 @@ steps: - "srun julia --project -e 'using Pkg; Pkg.test()'" agents: slurm_mem: 10G - slurm_gpus: 2 - slurm_cpus_per_task: 1 + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 8 slurm_ntasks: 4 - wait: ~ From e8962c75312221c49525eea16c1ef0133f68940a Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Thu, 16 Oct 2025 09:03:13 +1100 Subject: [PATCH 12/15] Update Oceananigans version to 0.100.5 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index d2923508a..d7b576e84 100644 --- a/Project.toml +++ b/Project.toml @@ -57,7 +57,7 @@ KernelAbstractions = "0.9" MPI = "0.20" MeshArrays = "0.3" NCDatasets = "0.12, 0.13, 0.14" -Oceananigans = "0.100.4" +Oceananigans = "0.100.5" OffsetArrays = "1.14" PrecompileTools = "1" Reactant = "0.2.45" From fa8f5ff4080971e1b1b0bfdc52aa9ba2a0b1b053 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Mon, 27 Oct 2025 08:59:37 +0100 Subject: [PATCH 13/15] Update memory allocation to 100G for tests Increased memory allocation for distributed models test. --- .buildkite/pipeline.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 0d8ba83c5..591cc0049 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -214,9 +214,8 @@ steps: commands: - "srun julia --project -e 'using Pkg; Pkg.test()'" agents: - slurm_mem: 10G + slurm_mem: 100G # Why would we need so much memory? slurm_gpus_per_task: 1 - slurm_cpus_per_task: 8 slurm_ntasks: 4 - wait: ~ From a635556931f3422a3a5069303057180ca4d6e4dc Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Mon, 27 Oct 2025 11:07:48 +0100 Subject: [PATCH 14/15] Update archs list to include only CPU architecture Comment out GPU architecture in archs list and add TODO note. --- test/test_distributed_models.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index 3cb0ca68e..ac488c63c 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -12,8 +12,11 @@ using Dates using ClimaSeaIce using ClimaSeaIce.SeaIceThermodynamics: IceWaterThermalEquilibrium -archs = [Distributed(CPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=true), - Distributed(GPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=true)] +# TODO: add a distributed GPU architecture to the list of archs... Requires making sure CUDA-aware MPI is enabled +archs = [Distributed(CPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=true)] + +# archs = [Distributed(CPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=true), +# Distributed(GPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=true)] function analytical_immersed_tripolar_grid(underlying_grid::TripolarGrid; radius = 5, # degrees From 1f086919547faea5feaf8f1cc3bcb356830f1423 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Tue, 28 Oct 2025 09:16:47 +0100 Subject: [PATCH 15/15] Apply suggestion from @simone-silvestri --- test/test_distributed_models.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl index ac488c63c..779024394 100644 --- a/test/test_distributed_models.jl +++ b/test/test_distributed_models.jl @@ -9,8 +9,6 @@ using Oceananigans.Units using Oceananigans.DistributedComputations using Oceananigans.Architectures: on_architecture using Dates -using ClimaSeaIce -using ClimaSeaIce.SeaIceThermodynamics: IceWaterThermalEquilibrium # TODO: add a distributed GPU architecture to the list of archs... Requires making sure CUDA-aware MPI is enabled archs = [Distributed(CPU(); partition = Partition(y = DistributedComputations.Equal()), synchronized_communication=true)]