9 changes: 9 additions & 0 deletions Project.toml
@@ -36,18 +36,26 @@ StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"

[extensions]
VortexPastaAMDGPUExt = ["AMDGPU"]
VortexPastaCUDAExt = ["CUDA"]
VortexPastaMakieExt = ["Makie"]
VortexPastaOpenCLExt = ["OpenCL"]

[compat]
AMDGPU = "2.2"
AbstractFFTs = "1.3"
AcceleratedKernels = "0.4.3"
Accessors = "0.1.39"
Adapt = "4.0.4"
Atomix = "1.1.2"
Bumper = "0.4, 0.5, 0.6, 0.7"
CUDA = "5.9"
CommonSolve = "0.2.4"
FFTW = "1.6"
FastGaussQuadrature = "1"
@@ -62,6 +70,7 @@ LinearAlgebra = "1.9"
Makie = "0.24"
NonuniformFFTs = "0.8.2, 0.9"
OhMyThreads = "0.7.0, 0.8"
OpenCL = "0.10"
PrecompileTools = "1"
Random = "1.10"
SIMD = "3.7.1"
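The `[weakdeps]`/`[extensions]` entries above mean that each GPU extension is compiled and loaded only once the corresponding GPU package is loaded in the same session. A minimal sketch of how this is expected to behave (assuming VortexPasta and CUDA.jl are both installed; the `Base.get_extension` check is standard Julia ≥ 1.9 behaviour, not something defined by this PR):

```julia
using VortexPasta
using CUDA  # loading the weak dependency triggers compilation of VortexPastaCUDAExt

# Once both packages are loaded, the extension module can be queried:
ext = Base.get_extension(VortexPasta, :VortexPastaCUDAExt)
@assert ext !== nothing  # the CUDA-specific pagelock!/unpagelock! methods are now available
```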
1 change: 1 addition & 0 deletions docs/src/modules/BiotSavart.md
@@ -120,6 +120,7 @@ activate_device!
```@docs
AbstractBackend
BiotSavartCache
HostVector
background_vorticity_correction!
process_point_charges!
copy_output_values_on_nodes!
30 changes: 30 additions & 0 deletions ext/VortexPastaAMDGPUExt.jl
@@ -0,0 +1,30 @@
module VortexPastaAMDGPUExt

using AMDGPU: AMDGPU, HIP, ROCBackend
using VortexPasta.BiotSavart

# See:
# - VortexPastaCUDAExt.jl
# - https://github.com/JuliaGPU/AMDGPU.jl/blob/4d9bff975a46d7867b23c0d1165b05f426dd42ab/src/runtime/memory/hip.jl#L197
# - https://rocm.docs.amd.com/projects/HIP/en/docs-7.2.0/doxygen/html/group___memory.html#gab8258f051e1a1f7385f794a15300e674
function BiotSavart.pagelock!(::ROCBackend, A::DenseArray{T}) where {T}
@debug "Pinning CPU array using AMDGPU"
ptr = pointer(A)
bytesize = length(A) * sizeof(T)
flags = HIP.hipHostRegisterPortable # "Memory is considered registered by all contexts. HIP only supports one context so this is always assumed true."
if !isempty(A)
HIP.hipHostRegister(ptr, bytesize, flags)
end
nothing
end

function BiotSavart.unpagelock!(::ROCBackend, A::DenseArray{T}) where {T}
@debug "Unpinning CPU array using AMDGPU"
ptr = pointer(A)
if !isempty(A)
HIP.hipHostUnregister(ptr)
end
nothing
end

end
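For reference, a minimal usage sketch of these hooks (hypothetical: it assumes AMDGPU.jl is installed with a working ROCm device, and uses the `pagelock!`/`unpagelock!` API introduced in this PR):

```julia
using AMDGPU
using VortexPasta.BiotSavart: pagelock!, unpagelock!

A = rand(Float32, 1 << 20)        # ordinary (pageable) host array
backend = AMDGPU.ROCBackend()

pagelock!(backend, A)             # register the host memory with HIP
try
    d_A = ROCArray(A)             # host -> device copies can now use the pinned buffer
    # ... device-side work on d_A ...
finally
    unpagelock!(backend, A)       # always unregister before the host array goes out of scope
end
```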
28 changes: 28 additions & 0 deletions ext/VortexPastaCUDAExt.jl
@@ -0,0 +1,28 @@
module VortexPastaCUDAExt

using CUDA: CUDA, CUDABackend
using VortexPasta.BiotSavart

# Adapted from https://github.com/JuliaGPU/CUDA.jl/blob/e2eab84e89085b5298d44634858993939479f0ac/lib/cudadrv/memory.jl#L167
# See also https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
function BiotSavart.pagelock!(::CUDABackend, A::DenseArray{T}) where {T}
@debug "Pinning CPU array using CUDA"
ptr = pointer(A)
bytesize = length(A) * sizeof(T)
flags = CUDA.MEMHOSTREGISTER_PORTABLE # "The memory returned by this call will be considered as pinned memory by all CUDA contexts, not just the one that performed the allocation."
if !isempty(A)
CUDA.cuMemHostRegister_v2(ptr, bytesize, flags)
end
nothing
end

function BiotSavart.unpagelock!(::CUDABackend, A::DenseArray{T}) where {T}
@debug "Unpinning CPU array using CUDA"
ptr = pointer(A)
if !isempty(A)
CUDA.cuMemHostUnregister(ptr)
end
nothing
end

end
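The CUDA extension follows the same pattern. A hedged sketch of the intended call sequence (assuming CUDA.jl and a CUDA-capable device; the array names are placeholders):

```julia
using CUDA
using VortexPasta.BiotSavart: pagelock!, unpagelock!

src = rand(Float64, 1 << 20)                  # pageable host array
dst = CuArray{Float64}(undef, length(src))    # device array

pagelock!(CUDA.CUDABackend(), src)    # register host memory so the driver can DMA from it
copyto!(dst, src)                     # host -> device copy from the now-pinned buffer
unpagelock!(CUDA.CUDABackend(), src)  # unregister once transfers are done
```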
24 changes: 24 additions & 0 deletions ext/VortexPastaOpenCLExt.jl
@@ -0,0 +1,24 @@
module VortexPastaOpenCLExt

using OpenCL: CLArray
using VortexPasta.BiotSavart: BiotSavart, HostVector

# unsafe_copyto! is not defined between Ptr and CLPtr, so we use unsafe_wrap instead to work
# with Array and CLArray.
function BiotSavart.copyto_ptr!(dst::CLArray, src::HostVector, n)
@debug "Copying HostVector -> CLArray"
GC.@preserve dst src begin
A = unsafe_wrap(Array, pointer(src), (n,); own = false)
copyto!(dst, A)
end
end

function BiotSavart.copyto_ptr!(dst::HostVector, src::CLArray, n)
@debug "Copying CLArray -> HostVector"
GC.@preserve dst src begin
A = unsafe_wrap(Array, pointer(dst), (n,); own = false)
copyto!(A, src)
end
end

end
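The `unsafe_wrap` trick above can be illustrated in isolation. A minimal sketch (assuming OpenCL.jl ≥ 0.10 with a usable platform and the standard GPUArrays-style `CLArray` constructor; `host` stands in for a `HostVector`, since any buffer exposing `pointer` and a length works):

```julia
using OpenCL: CLArray

host = rand(Float32, 16)             # stand-in for a HostVector-like host buffer
dev  = CLArray{Float32}(undef, 16)   # device array

GC.@preserve host begin
    # View the host memory as a plain Array (no copy, not owned by Julia's GC),
    # then use the copyto! method defined between CLArray and Array.
    A = unsafe_wrap(Array, pointer(host), (length(host),); own = false)
    copyto!(dev, A)
end
```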
115 changes: 59 additions & 56 deletions src/BiotSavart/BiotSavart.jl
@@ -66,6 +66,7 @@ const VectorOfVelocities = VectorOfVec
const AllFilamentVelocities = AbstractVector{<:VectorOfVelocities}

include("ka_utils.jl")
include("host_device_transfers.jl")
include("pointdata.jl")
include("types_shortrange.jl")
include("types_longrange.jl")
@@ -397,32 +398,6 @@ function compute_on_nodes!(
fields
end

function copy_host_to_device!(dst::AbstractVector, src::AbstractVector, N = length(src))
resize_no_copy!(dst, N)
backend = KA.get_backend(dst)
if KA.device(backend) == 1
# CUDA: apparently we can't pagelock the same CPU array from multiple CUDA devices,
# so we just do it if we're on device 1.
KA.pagelock!(backend, src)
end
KA.copyto!(backend, dst, src)
nothing
end

function copy_host_to_device!(dst::StructVector, src::StructVector, N = length(src))
foreach(StructArrays.components(dst), StructArrays.components(src)) do a, b
copy_host_to_device!(a, b, N)
end
nothing
end

function copy_host_to_device!(dst::Tuple, src::Tuple, N = length(first(src)))
foreach(dst, src) do a, b
copy_host_to_device!(a, b, N)
end
nothing
end

function do_longrange!(
cache::LongRangeCache, outputs::NamedTuple, pointdata_cpu;
callback_vorticity::Fvort,
@@ -444,9 +419,9 @@

@timeit to "Copy point charges (host -> device)" begin
# Only copy fields needed for long-range computations
copy_host_to_device!(pointdata.nodes, pointdata_cpu.nodes)
copy_host_to_device!(pointdata.points, pointdata_cpu.points)
copy_host_to_device!(pointdata.charges, pointdata_cpu.charges)
copy_host_to_device!(pointdata.nodes, pointdata_cpu.nodes, pointdata.buf_host)
copy_host_to_device!(pointdata.points, pointdata_cpu.points, pointdata.buf_host)
copy_host_to_device!(pointdata.charges, pointdata_cpu.charges, pointdata.buf_host)
end
@timeit to "Process point charges" process_point_charges!(cache) # modifies pointdata (points and nodes)

@@ -497,13 +472,13 @@ function do_shortrange!(cache::ShortRangeCache, outputs::NamedTuple, pointdata_c
GC.@preserve pointdata begin # see docs for KA.copyto! (it shouldn't really be needed here)
# Resize output arrays
noutputs = length(pointdata_cpu.nodes)
foreach(v -> resize_no_copy!(v, noutputs), outputs) # resize output arrays
foreach(v -> resize_no_copy!(v, noutputs), outputs)

if LIA === Val(true) || LIA === Val(:only)
@timeit to "Copy point charges (host -> device)" begin
# For now, only copy what we need for local term
copy_host_to_device!(pointdata.derivatives_on_nodes, pointdata_cpu.derivatives_on_nodes) # needed for local term (LIA)
copy_host_to_device!(pointdata.subsegment_lengths, pointdata_cpu.subsegment_lengths) # needed for local term (LIA)
copy_host_to_device!(pointdata.derivatives_on_nodes, pointdata_cpu.derivatives_on_nodes, pointdata.buf_host) # needed for local term (LIA)
copy_host_to_device!(pointdata.subsegment_lengths, pointdata_cpu.subsegment_lengths, pointdata.buf_host) # needed for local term (LIA)
end
@timeit to "Local term (LIA)" compute_local_term!(outputs, cache) # NOTE: this function _replaces_ old values, so it must be called first
else
@@ -512,10 +487,10 @@

if LIA !== Val(:only)
@timeit to "Copy point charges (host -> device)" begin
copy_host_to_device!(pointdata.nodes, pointdata_cpu.nodes)
copy_host_to_device!(pointdata.node_idx_prev, pointdata_cpu.node_idx_prev) # needed in remove_self_interaction! (and in add_pair_interactions! if avoid_explicit_erf = false)
copy_host_to_device!(pointdata.points, pointdata_cpu.points)
copy_host_to_device!(pointdata.charges, pointdata_cpu.charges)
copy_host_to_device!(pointdata.nodes, pointdata_cpu.nodes, pointdata.buf_host)
copy_host_to_device!(pointdata.node_idx_prev, pointdata_cpu.node_idx_prev, pointdata.buf_host_int) # needed in remove_self_interaction! (and in add_pair_interactions! if avoid_explicit_erf = false)
copy_host_to_device!(pointdata.points, pointdata_cpu.points, pointdata.buf_host)
copy_host_to_device!(pointdata.charges, pointdata_cpu.charges, pointdata.buf_host)
end
@timeit to "Process point charges" process_point_charges!(cache) # useful in particular for cell lists
@timeit to "Pair interactions" add_pair_interactions!(outputs, cache)
@@ -555,6 +530,7 @@ function _compute_on_nodes!(
tasks = Task[]

if with_longrange
@debug "Computing long-range interactions asynchronously" KA.get_backend(cache.longrange)
let cache = cache.longrange
# Select elements of outputs with the same names as in `fields` (in this case :velocity and/or :streamfunction).
local outputs = NamedTuple{keys(fields)}(cache.outputs)
@@ -570,6 +546,7 @@
end

if with_shortrange
@debug "Computing short-range interactions asynchronously" KA.get_backend(cache.shortrange)
let cache = cache.shortrange
# Select elements of outputs with the same names as in `fields` (in this case :velocity and/or :streamfunction).
local outputs = NamedTuple{keys(fields)}(cache.outputs)
@@ -586,6 +563,7 @@

# Perform CPU-only operations associated to the short-range part (this differentiation is kind of arbitrary).
if with_shortrange && LIA !== Val(:only)
@debug "Computing synchronous operations"
@timeit to "CPU-only operations (synchronous)" begin
if params.shortrange.lia_segment_fraction !== nothing
@timeit to "Add local integrals" add_local_integrals!(fields, cache, fs)
@@ -601,17 +579,17 @@
while ntasks > 0
@timeit to "Wait for async task to finish" begin
taskname = take!(channel)::Symbol # wait for first async task to finish
@debug "Task finished: $taskname"
ntasks -= 1
end
# Add results from asynchronous task.
@timeit to "Copy output (device -> host)" let
local buf_cpu = pointdata.nodes # used as a temporary CPU buffer in GPU->CPU transfers (it already has the right size!)
if taskname == :shortrange
timer = cache.shortrange.to
_add_output_from_async_task!(fields, cache.shortrange.outputs, buf_cpu)
_add_output_from_async_task!(fields, cache.shortrange.outputs, cache.shortrange.pointdata.buf_host)
elseif taskname == :longrange
timer = cache.longrange.to
_add_output_from_async_task!(fields, cache.longrange.outputs, buf_cpu)
_add_output_from_async_task!(fields, cache.longrange.outputs, cache.longrange.pointdata.buf_host)
end
# Add timings from asynchronous computations.
TimerOutputs.merge!(to, timer; tree_point = tree_point_base) # https://github.com/KristofferC/TimerOutputs.jl/issues/143
@@ -623,18 +601,18 @@
nothing
end

function _add_output_from_async_task!(fields, outputs, buf_cpu)
function _add_output_from_async_task!(fields, outputs, buf_host::HostVector)
if hasproperty(fields, :streamfunction)
copy_output_values_on_nodes!(+, fields.streamfunction, outputs.streamfunction, buf_cpu)
copy_output_values_on_nodes!(+, fields.streamfunction, outputs.streamfunction, buf_host)
end
if hasproperty(fields, :velocity)
copy_output_values_on_nodes!(+, fields.velocity, outputs.velocity, buf_cpu)
copy_output_values_on_nodes!(+, fields.velocity, outputs.velocity, buf_host)
end
nothing
end

"""
copy_output_values_on_nodes!([op::Function], vs::AbstractVector, vs_d::AbstractVector, [vs_h::AbstractVector])
copy_output_values_on_nodes!([op::Function], vs::AbstractVector, vs_d::AbstractVector, [buf::HostVector])

Copy computed values onto a vector of vectors.

@@ -651,40 +629,65 @@ Copy computed values onto a vector of vectors.
"linear" vector, meaning that there is a single vector of quantities (`Vec3`) for all filament nodes across all filaments.
Note that this vector may also be on a GPU device (`_d` = device).

- `vs_h`: this is an optional CPU vector (`_h` = host), which may be passed if `vs_d` is _not_ a CPU
- `buf`: this is an optional CPU vector in the form of a [`HostVector`](@ref), which may be passed if `vs_d` is _not_ a CPU
array. In that case, it will be used as an intermediate buffer array, and thus passing it may help reduce CPU allocations.
"""
function copy_output_values_on_nodes!(
op::F, vs::AbstractVector, vs_d::AbstractVector, vs_h = nothing,
op::F, vs::AbstractVector, vs_d::AbstractVector, buf::Union{Nothing, HostVector} = nothing,
) where {F <: Function}
backend = KA.get_backend(vs_d)
_copy_output_values_on_nodes!(backend, op, vs, vs_d, vs_h)
T = Filaments.number_type(vs_d)
n = length(vs_d)
buf_host = if backend isa CPU
nothing # we don't need a buffer if doing CPU -> CPU copy
elseif buf === nothing
HostVector{T}(undef, backend, n)
else
@assert backend == KA.get_backend(buf) # make sure it's already associated to the right GPU backend
@assert buf isa HostVector{T} # make sure it has the right element type
resize_no_copy!(buf, n)
buf
end
_copy_output_values_on_nodes!(backend, op, vs, vs_d, buf_host)
end

function copy_output_values_on_nodes!(vs::AbstractVector, vs_d::AbstractVector, args...)
op(old, new) = new
copy_output_values_on_nodes!(op, vs, vs_d, args...)
end
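A small usage illustration of the interface documented above, exercising the CPU path so it stays runnable (names are placeholders; it assumes node values are `SVector{3,Float64}`s, which is what `Vec3` appears to be, and that no buffer is needed for a CPU -> CPU copy):

```julia
using StaticArrays
using VortexPasta.BiotSavart: copy_output_values_on_nodes!

# Two filaments with 2 and 3 nodes; output values initialised to zero.
vs = [fill(SVector(0.0, 0.0, 0.0), 2), fill(SVector(0.0, 0.0, 0.0), 3)]

# "Linear" vector with one value per node across all filaments (here on the CPU).
vs_d = [SVector(float(i), 0.0, 0.0) for i in 1:5]

copy_output_values_on_nodes!(+, vs, vs_d)  # accumulates vs_d onto the per-filament vectors
```

When `vs_d` lives on a GPU, passing a preallocated `HostVector` as the last argument (constructed as `HostVector{T}(undef, backend, n)`, as done in the method above) avoids allocating an intermediate host buffer on every call.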

function _copy_output_values_on_nodes!(backend::GPU, op::F, vs, vs_d, ::Nothing) where {F}
_copy_output_values_on_nodes!(backend, op, vs, vs_d, adapt(CPU(), vs)) # this allocates a new CPU array!
function _copy_output_values_on_nodes!(backend::GPU, op::F, vs::AllFilamentVelocities, vs_d::StructVector, buf_host::HostVector) where {F}
for (i, us) in enumerate(StructArrays.components(vs_d))
_copy_output_values_on_nodes!(backend, op, vs, i, us, buf_host) # copy i-th component
end
vs
end

function _copy_output_values_on_nodes!(backend::GPU, op::F, vs, vs_d, vs_h::AbstractVector) where {F}
function _copy_output_values_on_nodes!(backend::GPU, op::F, vs::AllFilamentVelocities, i::Int, vs_d::AbstractVector{T}, buf_host::HostVector{T}) where {F, T}
# Perform GPU -> CPU copy (device-to-host)
resize_no_copy!(vs_h, length(vs_d))
# KA.copyto!(backend, vs_h[i], vs_d[i]) # may fail on CUDA due to pinning of CPU memory (https://github.com/JuliaGPU/CUDA.jl/issues/2594)
copyto!(vs_h, vs_d)
KA.synchronize(backend) # make sure we're done copying data to CPU (may be needed on CUDA, where KA.copyto! is asynchronous)
_copy_output_values_on_nodes!(CPU(), op, vs, vs_h) # finally, perform CPU -> CPU copy
@assert backend == KA.get_backend(buf_host)
copy_device_to_host!(buf_host, vs_d)
_copy_output_values_on_nodes!(CPU(), op, vs, i, buf_host) # finally, perform CPU -> CPU copy
end

# CPU -> CPU copy (change of vector "format")
function _copy_output_values_on_nodes!(::CPU, op::F, vs::AbstractVector, vs_h::AbstractVector, ignored = nothing) where {F}
function _copy_output_values_on_nodes!(::CPU, op::F, vs::AllFilamentVelocities, vs_h::AbstractVector, ignored = nothing) where {F}
n = 0
@inbounds for vf in vs, j in eachindex(vf)
q = vs_h[n += 1]
vf[j] = op(vf[j], q) # typically op == +, meaning that we add to the previous value
vf[j] = @inline op(vf[j], q) # typically op == +, meaning that we add to the previous value
end
vs
end

# Same as above, but for a single component `i`.
function _copy_output_values_on_nodes!(::CPU, op::F, vs::AllFilamentVelocities, i::Int, vs_h::HostVector{T}, ignored = nothing) where {F, T}
n = 0
@inbounds for vf in vs, j in eachindex(vf)
q = vs_h[n += 1]::T
v = vf[j]
vi_new = @inline op(v[i], q) # typically op == +, meaning that we add to the previous value
vf[j] = Base.setindex(v, vi_new, i)
end
vs
end
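The single-component method above relies on the non-mutating `Base.setindex`, which for static vectors returns a new vector with one entry replaced. A small sketch of that operation (assuming node values are `SVector`s, as `Vec3` appears to be):

```julia
using StaticArrays

v = SVector(1.0, 2.0, 3.0)
q = 0.5                              # contribution to the second component
v = Base.setindex(v, v[2] + q, 2)    # SVector(1.0, 2.5, 3.0); the original value is not mutated
```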