diff --git a/src/BiotSavart/BiotSavart.jl b/src/BiotSavart/BiotSavart.jl
index 131984f07..9372475ba 100644
--- a/src/BiotSavart/BiotSavart.jl
+++ b/src/BiotSavart/BiotSavart.jl
@@ -672,22 +672,30 @@ end
 
 # CPU -> CPU copy (change of vector "format")
 function _copy_output_values_on_nodes!(::CPU, op::F, vs::AllFilamentVelocities, vs_h::AbstractVector, ignored = nothing) where {F}
-    n = 0
-    @inbounds for vf in vs, j in eachindex(vf)
-        q = vs_h[n += 1]
-        vf[j] = @inline op(vf[j], q) # typically op == +, meaning that we add to the previous value
+    @sync for chunk in FilamentChunkIterator(vs)
+        Threads.@spawn for (i, inds, n) in chunk
+            vf = vs[i]
+            @inbounds for j in inds
+                q = vs_h[n += 1]
+                vf[j] = @inline op(vf[j], q) # typically op == +, meaning that we add to the previous value
+            end
+        end
     end
     vs
 end
 
 # Same as above, but for a single component `i`.
 function _copy_output_values_on_nodes!(::CPU, op::F, vs::AllFilamentVelocities, i::Int, vs_h::HostVector{T}, ignored = nothing) where {F, T}
-    n = 0
-    @inbounds for vf in vs, j in eachindex(vf)
-        q = vs_h[n += 1]::T
-        v = vf[j]
-        vi_new = @inline op(v[i], q) # typically op == +, meaning that we add to the previous value
-        vf[j] = Base.setindex(v, vi_new, i)
+    @sync for chunk in FilamentChunkIterator(vs)
+        Threads.@spawn for (filament_idx, inds, n) in chunk
+            vf = vs[filament_idx]
+            @inbounds for j in inds
+                q = vs_h[n += 1]::T
+                v = vf[j]
+                vi_new = @inline op(v[i], q) # typically op == +, meaning that we add to the previous value
+                vf[j] = Base.setindex(v, vi_new, i)
+            end
+        end
     end
     vs
 end
diff --git a/src/BiotSavart/host_device_transfers.jl b/src/BiotSavart/host_device_transfers.jl
index f86df2c6e..811a8fbf3 100644
--- a/src/BiotSavart/host_device_transfers.jl
+++ b/src/BiotSavart/host_device_transfers.jl
@@ -92,7 +92,6 @@ end
 # package extension.
 copyto_ptr!(dst, src, n) = unsafe_copyto!(pointer(dst), pointer(src), n)
 
-# TODO: parallelise copy when src is on the CPU?
 function Base.copyto!(v::HostVector{T}, src::DenseArray{T}) where {T}
     n = length(src)
     @assert length(v) == n # already resized
@@ -100,9 +99,19 @@ function Base.copyto!(v::HostVector{T}, src::DenseArray{T}) where {T}
     GC.@preserve src v begin
         copyto_ptr!(v, src, n)
     end
+    v
+end
+
+# Specialisation for CPU -> CPU case
+function Base.copyto!(v::HostVector{T}, src::Array{T}) where {T}
+    n = length(src)
+    @assert length(v) == n # already resized
+    Threads.@threads for i in eachindex(src, v)
+        @inbounds v[i] = src[i]
+    end
+    v
 end
 
-# TODO: parallelise copy when dst is on the CPU?
 function Base.copyto!(dst::DenseArray{T}, v::HostVector{T}) where {T}
     n = length(v)
     @assert length(dst) == n # already resized
@@ -110,6 +119,17 @@ function Base.copyto!(dst::DenseArray{T}, v::HostVector{T}) where {T}
     GC.@preserve dst v begin
         copyto_ptr!(dst, v, n)
     end
+    dst
+end
+
+# Specialisation for CPU -> CPU case
+function Base.copyto!(dst::Array{T}, v::HostVector{T}) where {T}
+    n = length(v)
+    @assert length(dst) == n # already resized
+    Threads.@threads for i in eachindex(dst, v)
+        @inbounds dst[i] = v[i]
+    end
+    dst
 end
 
 ## ========================================================================================== ##
@@ -128,7 +148,9 @@ end
 function copy_host_to_device!(dst::Vector, src::Vector, ::HostVector)
     n = length(src)
     resize_no_copy!(dst, n)
-    copyto!(dst, src)
+    Threads.@threads for i in eachindex(dst, src)
+        @inbounds dst[i] = src[i]
+    end
     dst
 end
 
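
Note (not part of the diff): the hunks above replace serial element-wise copies with thread-parallel loops for the CPU <-> CPU cases, using `Threads.@threads` for flat vectors and chunked `Threads.@spawn` tasks for per-filament data. Below is a minimal standalone sketch of the flat-vector pattern only, assuming plain `AbstractVector` arguments; the name `threaded_copyto!` is hypothetical and does not appear in the package.

# Standalone sketch of the threaded element-wise copy pattern introduced above.
# `threaded_copyto!` is a hypothetical name; the actual methods dispatch on
# HostVector / Array argument types.
function threaded_copyto!(dst::AbstractVector{T}, src::AbstractVector{T}) where {T}
    @assert length(dst) == length(src)  # the caller is assumed to have resized `dst`
    Threads.@threads for i in eachindex(dst, src)
        @inbounds dst[i] = src[i]
    end
    dst
end

# Usage (start Julia with several threads, e.g. `julia -t 4`):
src = rand(10^6)
dst = similar(src)
threaded_copyto!(dst, src)
@assert dst == src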