Skip to content

Commit ee294bc

Browse files
authored
Activate GPU device before resizing arrays (#91)
1 parent 1db7d91 commit ee294bc

File tree

3 files changed

+20
-3
lines changed

3 files changed

+20
-3
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
55

66
## Unreleased
77

8+
### Fixed
9+
10+
- Fix multi-GPU issue on AMDGPU.
11+
On AMDGPU (not sure about other backends), when an array on device X is `resize!`d while
12+
device Y is activated, the array is then silently "transferred" to device Y. For this
13+
reason, we now resize arrays on the same device where they were initially created. This
14+
might be a bug in AMDGPU.jl.
15+
816
## [0.32.16] - 2026-01-28
917

1018
### Changed

src/BiotSavart/BiotSavart.jl

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,10 @@ function do_longrange!(
438438
# Copy point data to the cache (possibly on a GPU).
439439
@assert pointdata_cpu !== pointdata # they are different objects
440440
GC.@preserve pointdata begin # see docs for KA.copyto! (it shouldn't really be needed here)
441+
# Resize output arrays
442+
noutputs = length(pointdata_cpu.nodes)
443+
foreach(v -> resize_no_copy!(v, noutputs), outputs)
444+
441445
@timeit to "Copy point charges (host -> device)" begin
442446
# Only copy fields needed for long-range computations
443447
copy_host_to_device!(pointdata.nodes, pointdata_cpu.nodes)
@@ -491,6 +495,10 @@ function do_shortrange!(cache::ShortRangeCache, outputs::NamedTuple, pointdata_c
491495

492496
@timeit to "Short-range component (async)" begin
493497
GC.@preserve pointdata begin # see docs for KA.copyto! (it shouldn't really be needed here)
498+
# Resize output arrays
499+
noutputs = length(pointdata_cpu.nodes)
500+
foreach(v -> resize_no_copy!(v, noutputs), outputs) # resize output arrays
501+
494502
if LIA === Val(true) || LIA === Val(:only)
495503
@timeit to "Copy point charges (host -> device)" begin
496504
# For now, only copy what we need for local term
@@ -543,15 +551,13 @@ function _compute_on_nodes!(
543551
# TODO: skip unneeded quantities if LIA === Val(:only) or LIA === Val(false)
544552
@timeit to "Add point charges" add_point_charges!(pointdata, fs, params) # done on the CPU
545553

546-
noutputs = sum(length, fs) # total number of interpolation points
547554
channel = Channel{Symbol}(2) # 2 is the length of the channel (for :shortrange + :longrange)
548555
tasks = Task[]
549556

550557
if with_longrange
551558
let cache = cache.longrange
552559
# Select elements of outputs with the same names as in `fields` (in this case :velocity and/or :streamfunction).
553560
local outputs = NamedTuple{keys(fields)}(cache.outputs)
554-
foreach(v -> resize_no_copy!(v, noutputs), outputs) # resize output arrays
555561
# Compute long-range part asynchronously (e.g. on a GPU).
556562
local task = Threads.@spawn :interactive try
557563
do_longrange!(cache, outputs, pointdata; callback_vorticity)
@@ -567,7 +573,6 @@ function _compute_on_nodes!(
567573
let cache = cache.shortrange
568574
# Select elements of outputs with the same names as in `fields` (in this case :velocity and/or :streamfunction).
569575
local outputs = NamedTuple{keys(fields)}(cache.outputs)
570-
foreach(v -> resize_no_copy!(v, noutputs), outputs) # resize output arrays
571576
# Compute short-range part asynchronously (e.g. on a GPU).
572577
local task = Threads.@spawn :interactive try
573578
do_shortrange!(cache, outputs, pointdata; LIA)

src/BiotSavart/shortrange/cache_common.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ end
1313
function ShortRangeCacheCommon(params::ParamsShortRange, pointdata_in::PointData)
1414
(; backend,) = params
1515
ka_backend = KA.get_backend(backend) # CPU, CUDABackend, ROCBackend, ...
16+
# Make sure we've activated the device (e.g. GPU id) where short-range computations will
17+
# be performed. We need arrays to be allocated in that device.
18+
expected_device = KA.device(backend) # 1, 2, ...
19+
@assert KA.device(ka_backend) == expected_device
1620
pointdata = adapt(ka_backend, pointdata_in) # create PointData replica on the device if needed
1721
if pointdata === pointdata_in # basically if ka_backend isa CPU
1822
pointdata = copy(pointdata_in) # make sure pointdata and pointdata_in are not aliased!

0 commit comments

Comments
 (0)