-
Notifications
You must be signed in to change notification settings - Fork 79
Open
Description
This code performs a memory copy from `src` to `dst` (each thread loads and stores `Nitem` consecutive elements):
# Vectorized memory-copy kernel (KernelAbstractions.jl): each work-item copies a
# contiguous chunk of `Nitem` elements from B into A. `Nitem` is lifted into the
# type domain via Val so the chunk size is a compile-time constant.
@kernel function copy_kernel!(A::AbstractVector{T}, B::AbstractVector{T}, ::Val{Nitem}) where {Nitem,T}
# Global (1-based) linear index of this work-item.
I = @index(Global)
# This work-item's chunk covers elements idx_base+1 .. idx_base+Nitem.
idx_base = (I - 1) * Nitem
N = length(B)
# Declared as a fixed-size NTuple so the load stage produces a statically-sized
# value (presumably kept in registers by the backend compiler — not verifiable here).
local values::NTuple{Nitem,T}
# Number of in-bounds elements left for this chunk; the tail chunk may be partial.
remaining = N - idx_base
# Load up to Nitem elements; positions past the end of B are padded with zero(T)
# so the ntuple is always fully defined.
values = ntuple(i -> i <= remaining ? @inbounds(B[idx_base+i]) : zero(T), Val(Nitem))
# NOTE(review): per the surrounding report, uncommenting this @synchronize()
# makes the kernel fail to compile, while CUDA's sync_threads() works instead.
#@synchronize()
# Store stage: write each loaded value back, guarding the partial tail chunk
# so we never write past the end of A.
for i in 1:Nitem
if idx_base + i <= N
@inbounds A[idx_base+i] = values[i]
end
end
end
# Reproduction driver: copy a length-n Float32 vector on the GPU,
# with each work-item handling Nitem consecutive elements.
n = 100_000_000   # same value as the original 1000_00000, standard digit grouping
T = Float32
Nitem = 3
# Build the source data on the host, then upload it to the device once.
# (Previously src_host was itself a CuArray, so the CuArray(src_host) call
# below performed a redundant device-to-device copy and the name was misleading.)
src_host = Array{T}(1:n)
src = CuArray(src_host)
dst = similar(src)
backend = get_backend(src)
# One work-item per chunk of Nitem elements; cld rounds up so the tail is covered.
ndrange = cld(length(src), Nitem)
# @sync blocks until the kernel finishes (kernel launches are asynchronous).
CUDA.@sync copy_kernel!(backend)(dst, src, Val(Nitem); ndrange=ndrange)
However, if I uncomment the @synchronize() line, the kernel no longer compiles. Note that on the CUDA backend, replacing @synchronize() with sync_threads() does work.
By the way, after some profiling, the above memory-copy kernel (without the sync) appears faster than the copy!() kernel from CUDA.jl, provided Nitem is tuned as a function of the GPU architecture and the size of n.
Metadata
Metadata
Assignees
Labels
No labels