@@ -2,36 +2,61 @@ import CUDAnative, CUDAdrv
2
2
import CUDAnative: cufunction
3
3
import CUDAdrv: CuEvent, CuStream, CuDefaultStream
4
4
5
- STREAMS = CuStream[]
6
- let id = 1
7
- global next_stream
8
- function next_stream ()
9
- global id
10
- stream = STREAMS[id]
11
- if id < length (STREAMS)
12
- id += 1
13
- else
14
- id = 1
5
# Freelist of streams that are ready for reuse, and the list of every
# stream this backend has created (scanned by the stream GC in
# `next_stream`).
const FREE_STREAMS = CuStream[]
const STREAMS = CuStream[]

# Once `length(STREAMS)` exceeds this threshold, `next_stream` scans all
# known streams for completed ones to recycle. Overridable at load time
# via the KERNELABSTRACTIONS_STREAMS_GC_THRESHOLD environment variable.
const STREAM_GC_THRESHOLD = Ref{Int}(16)
8
+
9
# Module-load hook: allow the stream-GC threshold to be overridden via the
# environment. Runs once when the backend is initialized.
@init begin
    if haskey(ENV, "KERNELABSTRACTIONS_STREAMS_GC_THRESHOLD")
        # No `global` declaration needed: STREAM_GC_THRESHOLD is a `const`
        # global Ref, and setting its contents is not a rebinding.
        STREAM_GC_THRESHOLD[] = parse(Int, ENV["KERNELABSTRACTIONS_STREAMS_GC_THRESHOLD"])
    end
end
15
+
16
## Stream GC
# Simplistic stream GC: when the total number of created streams grows past
# a threshold, scan them all and return those whose queued work has
# completed to the freelist.
# Alternative designs:
# - Enqueue a host function on the stream that adds the stream back to the freelist
# - Attach a finalizer to events that adds the stream back to the freelist
# Possible improvements
# - Add a background task that occasionally scans all streams
# - Add a hysteresis by checking a "since last scanned" timestamp
# - Add locking
function next_stream()
    # Fast path: recycle a stream from the freelist.
    isempty(FREE_STREAMS) || return pop!(FREE_STREAMS)

    # Freelist is empty; if we already have many streams, scan them and
    # move the idle ones (all work completed) onto the freelist.
    if length(STREAMS) > STREAM_GC_THRESHOLD[]
        for candidate in STREAMS
            if CUDAdrv.query(candidate)
                push!(FREE_STREAMS, candidate)
            end
        end
    end

    # The scan may have recovered something.
    isempty(FREE_STREAMS) || return pop!(FREE_STREAMS)

    # Nothing reusable: allocate a fresh non-blocking stream and track it.
    fresh = CUDAdrv.CuStream(CUDAdrv.STREAM_NON_BLOCKING)
    push!(STREAMS, fresh)
    return fresh
end
18
48
19
49
# Event wrapper for the CUDA backend; carries the raw CuEvent used to
# express dependencies between launches (see `wait(::CudaEvent, ...)`).
struct CudaEvent <: Event
    event::CuEvent
end
22
# Block until `ev` has completed.
#
# When a `progress` callback is supplied, poll the event and invoke
# `progress()` between polls so the caller can drive other work
# (e.g. MPI/libuv progress); otherwise do a blocking wait on the event.
function wait(ev::CudaEvent, progress=nothing)
    if progress !== nothing
        while !CUDAdrv.query(ev.event)
            progress()
            # do we need to `yield` here?
        end
    else
        CUDAdrv.wait(ev.event)
    end
end
37
62
@@ -43,13 +68,7 @@ function (obj::Kernel{CUDA})(args...; ndrange=nothing, dependencies=nothing, wor
43
68
dependencies = (dependencies,)
44
69
end
45
70
46
- # Be conservative and launch on CuDefaultStream
47
- if dependencies === nothing
48
- stream = CuDefaultStream ()
49
- else
50
- stream = next_stream ()
51
- end
52
-
71
+ stream = next_stream ()
53
72
if dependencies != = nothing
54
73
for event in dependencies
55
74
@assert event isa CudaEvent
182
201
###
# GPU implementation of shared memory
###
@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(SharedMemory), ::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
    # Each `Id` names a distinct statically-sized shared-memory allocation;
    # wrap the raw pointer as a device array in the shared address space.
    shmem = CUDAnative._shmem(Val(Id), T, Val(prod(Dims)))
    return CUDAnative.CuDeviceArray(Dims, CUDAnative.DevicePtr{T, CUDAnative.AS.Shared}(shmem))
end
192
211
# - private memory for each workitem
193
212
# ##
194
213
195
@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(Scratchpad), ::Type{T}, ::Val{Dims}) where {T, Dims}
    # Private per-workitem storage, backed by an uninitialized MArray whose
    # size is derived from `Dims` via the `__size` helper.
    return MArray{__size(Dims), T}(undef)
end
217
+
218
@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__synchronize))
    # Workgroup barrier: lower to CUDAnative's thread-block synchronization.
    return CUDAnative.sync_threads()
end
0 commit comments