Commit f6bde2a

Put the launch configuration heuristic in gpu_call.
This makes it possible to overload gpu_call without having to do certain work twice (e.g., converting arguments to their kernel equivalents).
1 parent deb8ecb commit f6bde2a
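
With the heuristic now behind its own gpu_call method, a back end that has a better launch heuristic can overload that method directly instead of re-implementing the keyword handling. The sketch below is only illustrative: MyBackend, convert_arg, compile_kernel, max_threads and launch are hypothetical placeholders, not part of this commit or of GPUArrays; only the gpu_call method signatures are taken from the diff.

using GPUArrays

# Hypothetical back end: convert arguments and compile the kernel once, then
# derive the launch configuration from the compiled kernel rather than from
# the generic clamp-to-256 fallback.
struct MyBackend <: GPUArrays.AbstractGPUBackend end

function GPUArrays.gpu_call(backend::MyBackend, kernel, args, total_threads::Int; name=nothing)
    device_args = map(convert_arg, args)                   # convert once
    fun = compile_kernel(kernel, device_args; name=name)   # compile once
    threads = clamp(total_threads, 1, max_threads(fun))    # backend-specific heuristic
    blocks = max(cld(total_threads, threads), 1)
    launch(fun, device_args...; threads=threads, blocks=blocks)
end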

File tree

3 files changed: +21 -24 lines


src/device/execution.jl

Lines changed: 20 additions & 17 deletions
@@ -42,28 +42,38 @@ function gpu_call(kernel::Base.Callable, args...;
                    threads::Union{Int,Nothing}=nothing,
                    blocks::Union{Int,Nothing}=nothing,
                    name::Union{String,Nothing}=nothing)
-    # determine how many threads/blocks to launch
+    # non-trivial default values for launch configuration
     if total_threads===nothing && threads===nothing && blocks===nothing
         total_threads = length(target)
-    end
-    if total_threads !== nothing
-        if threads !== nothing || blocks !== nothing
-            error("Cannot specify both total_threads and threads/blocks configuration")
-        end
-        blocks, threads = thread_blocks_heuristic(total_threads)
-    else
+    elseif total_threads===nothing
         if threads === nothing
             threads = 1
         end
         if blocks === nothing
             blocks = 1
         end
+    elseif threads!==nothing || blocks!==nothing
+        error("Cannot specify both total_threads and threads/blocks configuration")
+    end
+
+    if total_threads !== nothing
+        gpu_call(backend(target), kernel, args, total_threads; name=name)
+    else
+        gpu_call(backend(target), kernel, args, threads, blocks; name=name)
     end
+end

-    gpu_call(backend(target), kernel, args...; threads=threads, blocks=blocks, name=name)
+# gpu_call method with a simple launch configuration heuristic.
+# this can be specialised if more sophisticated heuristics are available.
+function gpu_call(backend::AbstractGPUBackend, kernel, args, total_threads::Int; kwargs...)
+    threads = clamp(total_threads, 1, 256)
+    blocks = max(ceil(Int, total_threads / threads), 1)
+
+    gpu_call(backend, kernel, args, threads, blocks; kwargs...)
 end

-gpu_call(backend::AbstractGPUBackend, kernel, args...; kwargs...) = error("Not implemented") # COV_EXCL_LINE
+# bottom-line gpu_call method that is expected to be implemented by the back end
+gpu_call(backend::AbstractGPUBackend, kernel, args, threads::Int, blocks::Int; kwargs...) = error("Not implemented") # COV_EXCL_LINE

 """
     synchronize(A::AbstractArray)
@@ -74,10 +84,3 @@ function synchronize(A::AbstractArray)
     # fallback is a noop, for backends not needing synchronization. This
     # makes it easier to write generic code that also works for AbstractArrays
 end
-
-function thread_blocks_heuristic(len::Integer)
-    # TODO better threads default
-    threads = clamp(len, 1, 256)
-    blocks = max(ceil(Int, len / threads), 1)
-    (blocks, threads)
-end
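
A back end that does not specialise the new total_threads method falls back to the heuristic above, which computes the same numbers the removed thread_blocks_heuristic did. Two worked examples in plain Julia (the values follow directly from the code in this hunk):

# total_threads = 1000: threads are capped at 256 per block, giving 4 blocks
clamp(1000, 1, 256)            # -> 256
max(ceil(Int, 1000 / 256), 1)  # -> 4

# total_threads = 0: the edge case the heuristics testset (removed below in
# test/testsuite/base.jl) used to check; it still resolves to 1 thread, 1 block
clamp(0, 1, 256)               # -> 1
max(ceil(Int, 0 / 1), 1)       # -> 1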

src/reference.jl

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ end
 Base.getindex(r::JlRefValue) = r.x
 Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = JlRefValue(adapt(to, r[]))

-function GPUArrays.gpu_call(::JLBackend, f, args...; blocks::Int, threads::Int,
+function GPUArrays.gpu_call(::JLBackend, f, args, threads::Int, blocks::Int;
                             name::Union{String,Nothing})
     ctx = JLKernelContext(threads, blocks)
     device_args = jlconvert.(args)

test/testsuite/base.jl

Lines changed: 0 additions & 6 deletions
@@ -143,12 +143,6 @@ function test_base(AT)
         @test compare(a-> repeat(a, 4, 3), AT, rand(Float32, 10, 15))
     end

-    @testset "heuristics" begin
-        blocks, threads = thread_blocks_heuristic(0)
-        @test blocks == 1
-        @test threads == 1
-    end
-
     @testset "permutedims" begin
         @test compare(x->permutedims(x, [1, 2]), AT, rand(4, 4))
