
Commit ddbca8b

Fix ranks not seeing the correct device (#4827)
* fix device!
* couple of fixes
* add a test
* Bump patch release version 0.99.3
* Update test_distributed_architectures.jl
* Modify ROCGPU device function for AMD compatibility: update device function to use device_id! for AMD devices.
* Update architecture check for CUDAGPU
* remove stray spaces
* fix
* test the node rank
* add communicator = MPI.COMM_WORLD
* Update test_distributed_architectures.jl

---------

Co-authored-by: Navid C. Constantinou <[email protected]>
1 parent 31f6436 commit ddbca8b

File tree: 9 files changed (+33, -8 lines)

ext/OceananigansAMDGPUExt.jl

Lines changed: 1 addition & 0 deletions

@@ -35,6 +35,7 @@ const ROCGPU = AC.GPU{ROCBackend}
 ROCGPU() = AC.GPU(AMDGPU.ROCBackend())

 Base.summary(::ROCGPU) = "ROCGPU"
+AC.device!(::ROCGPU, i) = AMDGPU.device_id!(i+1) # AMD devices are numbered 1..ndevices

 AC.architecture(::ROCArray) = ROCGPU()
 AC.architecture(::Type{ROCArray}) = ROCGPU()
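The new method takes the zero-based device index used elsewhere in Oceananigans, while AMDGPU.jl numbers devices from 1, hence the `i+1`. A minimal sketch of the intended mapping, assuming AMDGPU.jl's `devices()`/`device_id()` API and at least one ROCm device:

    # Sketch only: illustrates the zero-based convention device!(::ROCGPU, i) assumes.
    using AMDGPU
    using Oceananigans.Architectures: GPU, device!

    arch = GPU(AMDGPU.ROCBackend())              # i.e. a ROCGPU architecture
    for i in 0:(length(AMDGPU.devices()) - 1)
        device!(arch, i)                         # forwards to AMDGPU.device_id!(i + 1)
        @assert AMDGPU.device_id() == i + 1      # AMD devices are numbered 1..ndevices
    end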

ext/OceananigansCUDAExt.jl

Lines changed: 1 addition & 0 deletions

@@ -54,6 +54,7 @@ function UT.versioninfo_with_gpu(::CUDAGPU)
 end

 Base.summary(::CUDAGPU) = "CUDAGPU"
+AC.device!(::CUDAGPU, i) = CUDA.device!(i)

 AC.architecture(::CuArray) = CUDAGPU()
 AC.architecture(::Type{CuArray}) = CUDAGPU()
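Together with the AMDGPU method above, this is what lets each MPI rank select its own GPU. A hedged sketch of how a distributed run might pin ranks to node-local devices; the exact call site inside Oceananigans may differ, and `ndevices` comes from Oceananigans.Architectures:

    # Sketch: pin each MPI rank to a distinct GPU on its node via device!.
    using MPI, CUDA
    using Oceananigans.Architectures: GPU, device!, ndevices

    MPI.Init()
    comm = MPI.COMM_WORLD

    # Split by node so ranks sharing a node get consecutive node-local ranks
    local_comm = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, MPI.Comm_rank(comm))
    local_rank = MPI.Comm_rank(local_comm)

    arch = GPU(CUDA.CUDABackend())               # i.e. a CUDAGPU architecture
    device!(arch, local_rank % ndevices(arch))   # forwards to CUDA.device!(i), zero-based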

src/Architectures.jl

Lines changed: 1 addition & 2 deletions

@@ -58,9 +58,8 @@ struct ReactantState <: AbstractSerialArchitecture end

 device(a::CPU) = KA.CPU()
 device(a::GPU) = a.device
-device!(::CPU, i) = KA.device!(CPU(), i+1)
+device!(::CPU, i) = nothing
 device!(::CPU) = nothing
-device!(a::GPU, i) = KA.device!(a.device, i+1)
 ndevices(a::CPU) = KA.ndevices(KA.CPU())
 ndevices(a::AbstractArchitecture) = KA.ndevices(a.device)
 synchronize(a::CPU) = KA.synchronize(KA.CPU())
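With this change, selecting a device on a CPU architecture becomes a harmless no-op, and the GPU method now lives in the backend extensions above instead of dispatching through KernelAbstractions. A minimal check, assuming a version of Oceananigans that includes this commit:

    using Oceananigans.Architectures: CPU, device!

    device!(CPU(), 3)   # returns nothing: there is no device to select on the CPU
    device!(CPU())      # also a no-op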

src/DistributedComputations/distributed_architectures.jl

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 using Oceananigans.Architectures
 using Oceananigans.Grids: topology, validate_tupled_argument

-import Oceananigans.Architectures: device, cpu_architecture, on_architecture, array_type, child_architecture, convert_to_device
+import Oceananigans.Architectures: device, device!, cpu_architecture, on_architecture, array_type, child_architecture, convert_to_device
 import Oceananigans.Grids: zeros
 import Oceananigans.Utils: sync_device!, tupleit

src/DistributedComputations/distributed_grids.jl

Lines changed: 1 addition & 1 deletion

@@ -130,7 +130,7 @@ function LatitudeLongitudeGrid(arch::Distributed,
                                z,
                                topology = nothing,
                                radius = R_Earth,
-                               halo = (1, 1, 1))
+                               halo = nothing)

     topology, global_sz, halo, latitude, longitude, z, precompute_metrics =
         validate_lat_lon_grid_args(topology, size, halo, FT, latitude, longitude, z, precompute_metrics)
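Defaulting `halo` to `nothing` lets `validate_lat_lon_grid_args` pick the same default halo as the serial constructor instead of forcing `(1, 1, 1)` on distributed grids. A hedged usage sketch of a distributed CPU run; keyword names follow the standard LatitudeLongitudeGrid API:

    using MPI, Oceananigans
    using Oceananigans.DistributedComputations

    MPI.Init()
    arch = Distributed(CPU())

    # No halo keyword: the default is now chosen by the shared validation logic
    grid = LatitudeLongitudeGrid(arch;
                                 size = (180, 90, 4),
                                 longitude = (-180, 180),
                                 latitude = (-80, 80),
                                 z = (-1000, 0))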

src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_buffers.jl

Lines changed: 1 addition & 1 deletion

@@ -48,7 +48,7 @@ function compute_buffer_tendency_contributions!(grid::DistributedActiveInteriorI

         # If the map == nothing, we don't need to compute the buffer because
         # the buffer is not adjacent to a processor boundary
-        !isnothing(map) && compute_hydrostatic_free_surface_tendency_contributions!(model, :xyz; active_cells_map)
+        !isnothing(active_cells_map) && compute_hydrostatic_free_surface_tendency_contributions!(model, :xyz; active_cells_map)
     end

     return nothing

src/Utils/kernel_launching.jl

Lines changed: 1 addition & 1 deletion

@@ -351,7 +351,7 @@ end

 # launching with an empty tuple has no effect
 @inline function launch!(arch, grid, workspec_tuple::Tuple{}, kernel, args...; kwargs...)
-    @warn "trying to launch kernel $kernel! with workspec == (). The kernel will not be launched."
+    @warn "trying to launch kernel $kernel with workspec == (). The kernel will not be launched."
     return nothing
 end
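The fix drops a stray `!` from the interpolated kernel name in the warning. For illustration, a hedged sketch of hitting this no-op branch; `dummy_kernel!` is a placeholder that is never executed because the workspec is empty:

    using Oceananigans
    using Oceananigans.Utils: launch!

    dummy_kernel!(args...) = nothing   # placeholder; never called for workspec == ()

    grid = RectilinearGrid(CPU(), size = (4, 4, 4), extent = (1, 1, 1))
    launch!(CPU(), grid, (), dummy_kernel!)   # warns and returns nothing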

test/runtests.jl

Lines changed: 1 addition & 1 deletion

@@ -169,6 +169,7 @@ CUDA.allowscalar() do
         MPI.Initialized() || MPI.Init()
         # In case CUDA is not found, we reset CUDA and restart the julia session
         reset_cuda_if_necessary()
+        include("test_distributed_architectures.jl")
         include("test_distributed_models.jl")
     end

@@ -178,7 +179,6 @@ CUDA.allowscalar() do
         reset_cuda_if_necessary()
         include("test_distributed_transpose.jl")
         include("test_distributed_poisson_solvers.jl")
-        include("test_distributed_macros.jl")
     end

     if group == :distributed_hydrostatic_model || group == :all

test/test_distributed_macros.jl renamed to test/test_distributed_architectures.jl

Lines changed: 25 additions & 1 deletion

@@ -1,7 +1,10 @@
+include("dependencies_for_runtests.jl")
+
 using MPI
 using Oceananigans.DistributedComputations
+using CUDA

-@testset begin
+@testset "Distributed macros" begin
     rank = MPI.Comm_rank(MPI.COMM_WORLD)

     @onrank 0 begin

@@ -57,3 +60,24 @@ using Oceananigans.DistributedComputations
     @onrank split_comm 0 @test a == [1, 3, 5, 7, 9]
     @onrank split_comm 1 @test a == [2, 4, 6, 8, 10]
 end
+
+#=
+@testset "Distributed architectures" begin
+    for arch in test_architectures()
+        child_arch = child_architecture(arch)
+
+        communicator = MPI.COMM_WORLD
+
+        if child_arch isa Oceananigans.Architectures.GPU
+            # Check that no device is the same!
+            local_comm = MPI.Comm_split_type(communicator, MPI.COMM_TYPE_SHARED, arch.local_rank)
+            node_rank = MPI.Comm_rank(local_comm)
+            device_number = CUDA.device().handle
+            # We are testing on the same node, therefore we can
+            # assume the GPU number changes with the rank
+            @test node_rank == device_number
+        end
+    end
+end
+=#
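The commented-out testset asserts that, on a single node, each node-local rank ends up on the CUDA device with the matching handle. A standalone hedged version of the same check, assuming one node with at least as many GPUs as ranks (the script name is hypothetical):

    # Run under MPI, e.g. `mpiexec -n 2 julia check_devices.jl`
    using MPI, CUDA

    MPI.Init()
    comm = MPI.COMM_WORLD

    local_comm = MPI.Comm_split_type(comm, MPI.COMM_TYPE_SHARED, MPI.Comm_rank(comm))
    node_rank = MPI.Comm_rank(local_comm)

    CUDA.device!(node_rank)                       # what device!(::CUDAGPU, i) forwards to
    @assert CUDA.device().handle == node_rank     # every rank sees a distinct device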
