Skip to content

Commit eff982d

Browse files
bors[bot] and vchuravy authored
Merge #85
85: Use hostcall for wait and stream GC r=vchuravy a=vchuravy

While looking at a profile from @lcw, I noticed that we spend a lot of CPU cycles on `cuEventQuery`. The second change is more questionable, since it might make GPU launches slower (I have yet to measure the costs).

Co-authored-by: Valentin Churavy <[email protected]>
Co-authored-by: Valentin Churavy <[email protected]>
2 parents 78cad4e + 7a9071f commit eff982d

File tree

1 file changed

+9
-7
lines changed

1 file changed

+9
-7
lines changed

lib/CUDAKernels/src/CUDAKernels.jl

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,14 +75,16 @@ import Base: wait
7575

7676
wait(ev::CudaEvent, progress=yield) = wait(CPU(), ev, progress)
7777

78-
function wait(::CPU, ev::CudaEvent, progress=yield)
79-
if progress === nothing
80-
CUDA.synchronize(ev.event)
81-
else
82-
while !isdone(ev)
83-
progress()
84-
end
78+
function wait(::CPU, ev::CudaEvent, progress=nothing)
79+
isdone(ev) && return nothing
80+
81+
event = Base.Threads.Event()
82+
stream = next_stream()
83+
wait(CUDADevice(), ev, nothing, stream)
84+
CUDA.launch(;stream) do
85+
notify(event)
8586
end
87+
wait(event)
8688
end
8789

8890
# Use this to synchronize between computation using the task local stream

0 commit comments

Comments
 (0)