Skip to content

Commit 02b3fb8

Browse files
authored
Merge pull request #243 from JuliaGPU/tb/gpu_call
Breaking gpu_call interface changes
2 parents 4463977 + f6bde2a commit 02b3fb8

File tree

4 files changed

+26
-26
lines changed

4 files changed

+26
-26
lines changed

src/device/execution.jl

Lines changed: 23 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -33,35 +33,47 @@ host to influence how the kernel is executed. The following keyword arguments ar
3333
no other keyword arguments that influence the launch configuration are specified.
3434
- `threads::Int` and `blocks::Int`: configure exactly how many threads and blocks are
3535
launched. This cannot be used in combination with the `total_threads` argument.
36+
- `name::String`: inform the back end about the name of the kernel to be executed.
37+
This can be used to emit better diagnostics, and is useful with anonymous kernels.
3638
"""
3739
function gpu_call(kernel::Base.Callable, args...;
3840
target::AbstractArray=first(args),
3941
total_threads::Union{Int,Nothing}=nothing,
4042
threads::Union{Int,Nothing}=nothing,
4143
blocks::Union{Int,Nothing}=nothing,
42-
kwargs...)
43-
# determine how many threads/blocks to launch
44+
name::Union{String,Nothing}=nothing)
45+
# non-trivial default values for launch configuration
4446
if total_threads===nothing && threads===nothing && blocks===nothing
4547
total_threads = length(target)
46-
end
47-
if total_threads !== nothing
48-
if threads !== nothing || blocks !== nothing
49-
error("Cannot specify both total_threads and threads/blocks configuration")
50-
end
51-
blocks, threads = thread_blocks_heuristic(total_threads)
52-
else
48+
elseif total_threads===nothing
5349
if threads === nothing
5450
threads = 1
5551
end
5652
if blocks === nothing
5753
blocks = 1
5854
end
55+
elseif threads!==nothing || blocks!==nothing
56+
error("Cannot specify both total_threads and threads/blocks configuration")
57+
end
58+
59+
if total_threads !== nothing
60+
gpu_call(backend(target), kernel, args, total_threads; name=name)
61+
else
62+
gpu_call(backend(target), kernel, args, threads, blocks; name=name)
5963
end
64+
end
6065

61-
gpu_call(backend(target), kernel, args...; threads=threads, blocks=blocks, kwargs...)
66+
# gpu_call method with a simple launch configuration heuristic.
67+
# this can be specialised if more sophisticated heuristics are available.
68+
function gpu_call(backend::AbstractGPUBackend, kernel, args, total_threads::Int; kwargs...)
69+
threads = clamp(total_threads, 1, 256)
70+
blocks = max(ceil(Int, total_threads / threads), 1)
71+
72+
gpu_call(backend, kernel, args, threads, blocks; kwargs...)
6273
end
6374

64-
gpu_call(backend::AbstractGPUBackend, kernel, args...; kwargs...) = error("Not implemented") # COV_EXCL_LINE
75+
# bottom-line gpu_call method that is expected to be implemented by the back end
76+
gpu_call(backend::AbstractGPUBackend, kernel, args, threads::Int, blocks::Int; kwargs...) = error("Not implemented") # COV_EXCL_LINE
6577

6678
"""
6779
synchronize(A::AbstractArray)
@@ -72,10 +84,3 @@ function synchronize(A::AbstractArray)
7284
# fallback is a noop, for backends not needing synchronization. This
7385
# makes it easier to write generic code that also works for AbstractArrays
7486
end
75-
76-
function thread_blocks_heuristic(len::Integer)
77-
# TODO better threads default
78-
threads = clamp(len, 1, 256)
79-
blocks = max(ceil(Int, len / threads), 1)
80-
(blocks, threads)
81-
end

src/host/broadcast.jl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,7 @@ end
6060
@inline function Base.copyto!(dest::GPUDestArray, bc::Broadcasted{Nothing})
6161
axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
6262
bc′ = Broadcast.preprocess(dest, bc)
63-
gpu_call(dest, bc′) do ctx, dest, bc′
63+
gpu_call(dest, bc′; name="broadcast") do ctx, dest, bc′
6464
let I = CartesianIndex(@cartesianidx(dest))
6565
#@inbounds dest[I] = bc′[I]
6666
@inbounds let

src/reference.jl

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,8 @@ end
6464
Base.getindex(r::JlRefValue) = r.x
6565
Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = JlRefValue(adapt(to, r[]))
6666

67-
function GPUArrays.gpu_call(::JLBackend, f, args...; blocks::Int, threads::Int)
67+
function GPUArrays.gpu_call(::JLBackend, f, args, threads::Int, blocks::Int;
68+
name::Union{String,Nothing})
6869
ctx = JLKernelContext(threads, blocks)
6970
device_args = jlconvert.(args)
7071
tasks = Array{Task}(undef, threads)

test/testsuite/base.jl

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -143,12 +143,6 @@ function test_base(AT)
143143
@test compare(a-> repeat(a, 4, 3), AT, rand(Float32, 10, 15))
144144
end
145145

146-
@testset "heuristics" begin
147-
blocks, threads = thread_blocks_heuristic(0)
148-
@test blocks == 1
149-
@test threads == 1
150-
end
151-
152146
@testset "permutedims" begin
153147
@test compare(x->permutedims(x, [1, 2]), AT, rand(4, 4))
154148

0 commit comments

Comments (0)