28 changes: 18 additions & 10 deletions src/BiotSavart/BiotSavart.jl
@@ -672,22 +672,30 @@ end

 # CPU -> CPU copy (change of vector "format")
 function _copy_output_values_on_nodes!(::CPU, op::F, vs::AllFilamentVelocities, vs_h::AbstractVector, ignored = nothing) where {F}
-    n = 0
-    @inbounds for vf in vs, j in eachindex(vf)
-        q = vs_h[n += 1]
-        vf[j] = @inline op(vf[j], q) # typically op == +, meaning that we add to the previous value
+    @sync for chunk in FilamentChunkIterator(vs)
+        Threads.@spawn for (i, inds, n) in chunk
+            vf = vs[i]
+            @inbounds for j in inds
+                q = vs_h[n += 1]
+                vf[j] = @inline op(vf[j], q) # typically op == +, meaning that we add to the previous value
+            end
+        end
     end
     vs
 end
 
 # Same as above, but for a single component `i`.
 function _copy_output_values_on_nodes!(::CPU, op::F, vs::AllFilamentVelocities, i::Int, vs_h::HostVector{T}, ignored = nothing) where {F, T}
-    n = 0
-    @inbounds for vf in vs, j in eachindex(vf)
-        q = vs_h[n += 1]::T
-        v = vf[j]
-        vi_new = @inline op(v[i], q) # typically op == +, meaning that we add to the previous value
-        vf[j] = Base.setindex(v, vi_new, i)
+    @sync for chunk in FilamentChunkIterator(vs)
+        Threads.@spawn for (filament_idx, inds, n) in chunk
+            vf = vs[filament_idx]
+            @inbounds for j in inds
+                q = vs_h[n += 1]::T
+                v = vf[j]
+                vi_new = @inline op(v[i], q) # typically op == +, meaning that we add to the previous value
+                vf[j] = Base.setindex(v, vi_new, i)
+            end
+        end
     end
     vs
 end
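Both methods previously walked all nodes in a single serial loop, tracking a running index `n` into the flat host vector `vs_h`. The new versions split the work by filament: `FilamentChunkIterator` (not defined in this diff) hands each `Threads.@spawn` task a chunk of `(filament_index, node_indices, offset)` entries, where `offset` is the number of nodes preceding that filament in the flattened layout, so each task can index `vs_h` independently. A minimal sketch of the contract these loops appear to assume; the name `filament_chunks` and the chunking granularity (whole filaments per entry) are assumptions, not the package's actual implementation:

using Base.Threads: nthreads

# Hypothetical stand-in for the FilamentChunkIterator contract assumed above:
# partition the filaments into ~nchunks contiguous groups, where each entry is
# (filament_index, node_indices, offset) and `offset` counts the nodes that
# precede the filament in the flattened layout of `vs_h`.
function filament_chunks(vs; nchunks = nthreads())
    lens = map(length, vs)
    offsets = cumsum(lens) .- lens  # exclusive prefix sum: nodes before each filament
    entries = [(i, eachindex(vs[i]), offsets[i]) for i in eachindex(vs)]
    # Each chunk is then consumed by one task, e.g. via Threads.@spawn.
    Iterators.partition(entries, max(1, cld(length(entries), nchunks)))
end

Since every entry carries its own starting offset, the `vs_h[n += 1]` pattern inside a task touches a disjoint slice of `vs_h` and never races with other tasks.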
28 changes: 25 additions & 3 deletions src/BiotSavart/host_device_transfers.jl
@@ -92,24 +92,44 @@ end
 # package extension.
 copyto_ptr!(dst, src, n) = unsafe_copyto!(pointer(dst), pointer(src), n)
 
-# TODO: parallelise copy when src is on the CPU?
 function Base.copyto!(v::HostVector{T}, src::DenseArray{T}) where {T}
     n = length(src)
     @assert length(v) == n # already resized
     # This should work both when src is either a CPU or a GPU array.
     GC.@preserve src v begin
         copyto_ptr!(v, src, n)
     end
     v
 end
 
+# Specialisation for CPU -> CPU case
+function Base.copyto!(v::HostVector{T}, src::Array{T}) where {T}
+    n = length(src)
+    @assert length(v) == n # already resized
+    Threads.@threads for i in eachindex(src, v)
+        @inbounds v[i] = src[i]
+    end
+    v
+end
+
-# TODO: parallelise copy when dst is on the CPU?
 function Base.copyto!(dst::DenseArray{T}, v::HostVector{T}) where {T}
     n = length(v)
     @assert length(dst) == n # already resized
     # This should work both when src is either a CPU or a GPU array.
     GC.@preserve dst v begin
         copyto_ptr!(dst, v, n)
     end
     dst
 end
 
+# Specialisation for CPU -> CPU case
+function Base.copyto!(dst::Array{T}, v::HostVector{T}) where {T}
+    n = length(v)
+    @assert length(dst) == n # already resized
+    Threads.@threads for i in eachindex(dst, v)
+        @inbounds dst[i] = v[i]
+    end
+    dst
+end
 
 ## ========================================================================================== ##
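The generic methods funnel through `unsafe_copyto!` on raw pointers, a single-threaded memcpy that also covers GPU arrays (via the package extension mentioned above); the two `-# TODO` lines marked this as a known serial bottleneck. The new `Array` specialisations resolve those TODOs by spreading the copy over threads. A standalone sketch of the pattern they use (illustrative only; `threaded_copy!` is a made-up name, not package API):

using Base.Threads: @threads

# Threaded elementwise copy, as in the new specialisations above.
# eachindex(dst, src) throws a DimensionMismatch if the arrays' indices
# disagree, which is what makes @inbounds safe inside the loop.
function threaded_copy!(dst::Array{T}, src::Array{T}) where {T}
    @threads for i in eachindex(dst, src)
        @inbounds dst[i] = src[i]
    end
    dst
end

src = rand(10^7)
dst = similar(src)
threaded_copy!(dst, src)
@assert dst == src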
@@ -128,7 +148,9 @@ end
 function copy_host_to_device!(dst::Vector, src::Vector, ::HostVector)
     n = length(src)
     resize_no_copy!(dst, n)
-    copyto!(dst, src)
+    Threads.@threads for i in eachindex(dst, src)
+        @inbounds dst[i] = src[i]
+    end
     dst
 end

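The last hunk applies the same idea to the `Vector` to `Vector` case of `copy_host_to_device!`: since neither argument is a `HostVector`, the new `copyto!` specialisations above do not dispatch here, so the threaded loop is written out inline in place of the previous serial `copyto!`. A rough way to check the benefit on a given machine (hypothetical harness; BenchmarkTools and the `threaded_copy!` sketch above are assumed, and sizes are illustrative):

using BenchmarkTools
using Base.Threads: nthreads

src = rand(Float64, 10^8)
dst = similar(src)

println("threads: ", nthreads())
@btime copyto!($dst, $src)          # serial memcpy baseline
@btime threaded_copy!($dst, $src)   # threaded loop (sketch above)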