Skip to content

Commit 3250f1e

Browse files
authored
DtoH copies: perform a nonblocking sync before calling into libcuda. (#2648)
We need to wait for the GPU to finish executing anyway, so we might as well maximize the time spent executing Julia code.
1 parent f62af73 commit 3250f1e

File tree

1 file changed

+12
-10
lines changed

1 file changed

+12
-10
lines changed

src/array.jl

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,10 @@ function Base.unsafe_copyto!(dest::DenseCuArray{T}, doffs,
531531
# synchronization here, but the exact cases are hard to know and detect (e.g., unpinned
532532
# memory normally blocks, but not for all sizes, and not on all memory architectures).
533533
GC.@preserve src dest begin
534+
# semantically, it is not safe for this operation to execute asynchronously, because
535+
# the Array may be collected before the copy starts executing. However, when using
536+
# unpinned memory, CUDA first stages a copy to a pinned buffer that will outlive
537+
# the source array, making this operation safe.
534538
unsafe_copyto!(pointer(dest, doffs), pointer(src, soffs), n; async=true)
535539
if Base.isbitsunion(T)
536540
unsafe_copyto!(typetagdata(dest, doffs), typetagdata(src, soffs), n; async=true)
@@ -543,20 +547,18 @@ end
543547
function Base.unsafe_copyto!(dest::Array{T}, doffs,
544548
src::DenseCuArray{T}, soffs, n) where T
545549
context!(context(src)) do
546-
# the copy below may block in `libcuda`; see the note above.
550+
# see comment above; this copy may also block in `libcuda` when dealing with e.g.
551+
# unpinned memory, but even more likely because we need to wait for the GPU to finish
552+
# so that the expected data is available. because of that, eagerly perform a nonblocking
553+
# synchronization first so as to maximize the time spent executing Julia code.
554+
synchronize(src)
555+
547556
GC.@preserve src dest begin
548-
# semantically, it is not safe for this operation to execute asynchronously, because
549-
# the Array may be collected before the copy starts executing. However, when using
550-
# unpinned memory, CUDA first stages a copy to a pinned buffer that will outlive
551-
# the source array, making this operation safe.
552-
unsafe_copyto!(pointer(dest, doffs), pointer(src, soffs), n; async=true)
557+
unsafe_copyto!(pointer(dest, doffs), pointer(src, soffs), n; async=false)
553558
if Base.isbitsunion(T)
554-
unsafe_copyto!(typetagdata(dest, doffs), typetagdata(src, soffs), n; async=true)
559+
unsafe_copyto!(typetagdata(dest, doffs), typetagdata(src, soffs), n; async=false)
555560
end
556561
end
557-
558-
# users expect values to be available after this call
559-
synchronize(src)
560562
end
561563
return dest
562564
end

0 commit comments

Comments
 (0)