Merge #85

bors[bot] · vchuravy · web-flow · commit eff982d8be20 · 2021-04-22T14:51:21.000Z
85: Use hostcall for wait and stream GC r=vchuravy a=vchuravy While looking at a profile from @lcw I noticed that we spend a lot of CPU cycles on `cuEventQuery`. The second change (to stream GC) is more questionable since it might make GPU launches slower (I have yet to measure the cost). Co-authored-by: Valentin Churavy <v.churavy@gmail.com> Co-authored-by: Valentin Churavy <vchuravy@users.noreply.github.com>
diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -75,14 +75,16 @@ import Base: wait
 
 wait(ev::CudaEvent, progress=yield) = wait(CPU(), ev, progress)
 
-function wait(::CPU, ev::CudaEvent, progress=yield)
-    if progress === nothing
-        CUDA.synchronize(ev.event)
-    else
-        while !isdone(ev)
-            progress()
-        end
+function wait(::CPU, ev::CudaEvent, progress=nothing)
+    isdone(ev) && return nothing
+
+    event = Base.Threads.Event()
+    stream = next_stream()
+    wait(CUDADevice(), ev, nothing, stream)
+    CUDA.launch(;stream) do
+        notify(event)
     end
+    wait(event)
 end
 
 # Use this to synchronize between computation using the task local stream