
Commit 56d0c1f

Simplify gpu_call.

1 parent 7e330b9

12 files changed: 70 additions & 73 deletions

src/device/execution.jl

Lines changed: 32 additions & 34 deletions
@@ -14,47 +14,45 @@ Gets the GPUArrays back-end responsible for managing arrays of type `T`.
 backend(::Type{<:AbstractArray}) = error("This array is not a GPU array") # COV_EXCL_LINE
 
 """
-    gpu_call(kernel::Function, A::AbstractGPUArray, args::Tuple, configuration = length(A))
+    gpu_call(kernel::Function, A::AbstractGPUArray, args...; kwargs...)
 
-Calls function `kernel` on the GPU.
-`A` must be an AbstractGPUArray and will help to dispatch to the correct GPU backend
-and supplies queues and contexts.
-Calls the kernel function with `kernel(ctx, args...)`, where ctx is dependant on the backend
-and can be used for getting an index into `A` with `linear_index(ctx)`.
-Optionally, a launch configuration can be supplied in the following way:
-
-1) A single integer, indicating how many work items (total number of threads) you want to launch.
-   in this case `linear_index(ctx)` will be a number in the range `1:configuration`
-2) Pass a tuple of integer tuples to define blocks and threads per blocks!
+Calls function `kernel` on the GPU device that backs array `A`, passing along arguments
+`args`. The keyword arguments `kwargs` are not passed along, but are interpreted on the host
+to influence how the kernel is executed. The following keyword arguments are supported:
 
+- `total_threads::Int`: how many threads should be launched _in total_. The actual number of
+  threads and blocks is determined using a heuristic. Defaults to the length of `A` if no
+  other keyword arguments that influence the launch configuration are specified.
+- `threads::Int` and `blocks::Int`: configure exactly how many threads and blocks are
+  launched. This cannot be used in combination with the `total_threads` argument.
 """
-function gpu_call(kernel, A::AbstractArray, args::Tuple, configuration = length(A))
-    ITuple = NTuple{N, Integer} where N
-    # If is a single integer, we assume it to be the global size / total number of threads one wants to launch
-    thread_blocks = if isa(configuration, Integer)
-        thread_blocks_heuristic(configuration)
-    elseif isa(configuration, ITuple)
-        @assert length(configuration) == 1
-        configuration[1], 1
-    elseif isa(configuration, Tuple{ITuple, ITuple})
-        @assert length(configuration[1]) == 1
-        @assert length(configuration[2]) == 1
-        configuration[1][1], configuration[2][1]
+function gpu_call(kernel::Base.Callable, A::AbstractArray, args...;
+                  total_threads::Union{Int,Nothing}=nothing,
+                  threads::Union{Int,Nothing}=nothing,
+                  blocks::Union{Int,Nothing}=nothing,
+                  kwargs...)
+    # determine how many threads/blocks to launch
+    if total_threads===nothing && threads===nothing && blocks===nothing
+        total_threads = length(A)
+    end
+    if total_threads !== nothing
+        if threads !== nothing || blocks !== nothing
+            error("Cannot specify both total_threads and threads/blocks configuration")
+        end
+        threads, blocks = thread_blocks_heuristic(total_threads)
     else
-        error("""Please launch a gpu kernel with a valid configuration.
-              Found: $configurations
-              Configuration needs to be:
-              1) A single integer, indicating how many work items (total number of threads) you want to launch.
-                 in this case `linear_index(ctx)` will be a number in the range 1:configuration
-              2) Pass a tuple of integer tuples to define blocks and threads per blocks!
-                 `linear_index` will be inbetween 1:prod((blocks..., threads...))
-              """)
+        if threads === nothing
+            threads = 1
+        end
+        if blocks === nothing
+            blocks = 1
+        end
     end
-    _gpu_call(backend(typeof(A)), kernel, A, args, thread_blocks)
+
+    gpu_call(backend(typeof(A)), kernel, args...; threads=threads, blocks=blocks, kwargs...)
 end
 
-# Internal GPU call function, that needs to be overloaded by the backends.
-_gpu_call(::Any, f, A, args, thread_blocks) = error("Not implemented") # COV_EXCL_LINE
+gpu_call(backend::AbstractGPUBackend, kernel, args...; kwargs...) = error("Not implemented") # COV_EXCL_LINE
 
 """
     synchronize(A::AbstractArray)
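To illustrate the new interface, here is a minimal sketch of the three launch configurations the docstring describes. The kernel name and array are invented for the example; it assumes `gpu_call` and `linear_index` are available from the package and uses the qualified `GPUArrays.JLArray` reference array type (defined in src/reference.jl below):

```julia
using GPUArrays

# Hypothetical kernel: the first argument is the backend-supplied context.
function add_one_kernel(ctx, a)
    i = linear_index(ctx)       # 1:total_threads
    i > length(a) && return
    @inbounds a[i] += 1
    return
end

A = GPUArrays.JLArray(zeros(Float32, 1024))

gpu_call(add_one_kernel, A, A)                             # defaults to total_threads = length(A)
gpu_call(add_one_kernel, A, A; total_threads = 512)        # heuristic splits into threads/blocks
gpu_call(add_one_kernel, A, A; threads = 256, blocks = 4)  # fully explicit configuration
```

Passing `total_threads` together with `threads`/`blocks` raises the error shown in the diff above, instead of silently picking one configuration.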

src/host/abstractarray.jl

Lines changed: 2 additions & 2 deletions
@@ -162,8 +162,8 @@ function Base.copyto!(dest::AbstractGPUArray{T, N}, destcrange::CartesianIndices
     dest_offsets = first.(destcrange.indices) .- 1
     src_offsets = first.(srccrange.indices) .- 1
     gpu_call(copy_kernel!, dest,
-             (dest, dest_offsets, src, src_offsets, shape, size(dest), size(src), len),
-             len)
+             dest, dest_offsets, src, src_offsets, shape, size(dest), size(src), len;
+             total_threads=len)
     dest
 end
 
src/host/base.jl

Lines changed: 2 additions & 2 deletions
@@ -62,7 +62,7 @@ end
 function Base.repeat(a::AbstractGPUVecOrMat, m::Int, n::Int = 1)
     o, p = size(a, 1), size(a, 2)
     b = similar(a, o*m, p*n)
-    gpu_call(a, (b, a, o, p, m, n), n) do ctx, b, a, o, p, m, n
+    gpu_call(a, b, a, o, p, m, n; total_threads=n) do ctx, b, a, o, p, m, n
         j = linear_index(ctx)
         j > n && return
         d = (j - 1) * p + 1
@@ -82,7 +82,7 @@ end
 function Base.repeat(a::AbstractGPUVector, m::Int)
     o = length(a)
     b = similar(a, o*m)
-    gpu_call(a, (b, a, o, m), m) do ctx, b, a, o, m
+    gpu_call(a, b, a, o, m; total_threads=m) do ctx, b, a, o, m
         i = linear_index(ctx)
         i > m && return
         c = (i - 1)*o + 1
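The `repeat` call sites above also show how anonymous kernels fit the new signature: the do-block becomes the `kernel` argument, and all remaining positional arguments are forwarded to it after the context. A hedged sketch (the array and the doubling kernel are made up for illustration):

```julia
a = GPUArrays.JLArray(ones(Float32, 16))

gpu_call(a, a; total_threads = length(a)) do ctx, x
    i = linear_index(ctx)
    i > length(x) && return
    @inbounds x[i] *= 2   # kernel body runs once per launched thread
    return
end
```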

src/host/broadcast.jl

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ end
 @inline function Base.copyto!(dest::GPUDestArray, bc::Broadcasted{Nothing})
     axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
     bc′ = Broadcast.preprocess(dest, bc)
-    gpu_call(dest, (dest, bc′)) do ctx, dest, bc′
+    gpu_call(dest, dest, bc′) do ctx, dest, bc′
         let I = CartesianIndex(@cartesianidx(dest))
             @inbounds dest[I] = bc′[I]
         end

src/host/construction.jl

Lines changed: 3 additions & 3 deletions
@@ -9,7 +9,7 @@ function Base.fill(X::Type{<: AbstractGPUArray{T}}, val, dims::NTuple{N, Integer
     fill!(res, convert(T, val))
 end
 function Base.fill!(A::AbstractGPUArray{T}, x) where T
-    gpu_call(A, (A, convert(T, x))) do ctx, a, val
+    gpu_call(A, A, convert(T, x)) do ctx, a, val
         idx = @linearidx(a, ctx)
         @inbounds a[idx] = val
         return
@@ -30,7 +30,7 @@ end
 
 function (T::Type{<: AbstractGPUArray})(s::UniformScaling, dims::Dims{2})
     res = zeros(T, dims)
-    gpu_call(uniformscaling_kernel, res, (res, size(res, 1), s), minimum(dims))
+    gpu_call(uniformscaling_kernel, res, res, size(res, 1), s; total_threads=minimum(dims))
     res
 end
 (T::Type{<: AbstractGPUArray})(s::UniformScaling, m::Integer, n::Integer) = T(s, Dims((m, n)))
@@ -67,7 +67,7 @@ function Base.convert(AT::Type{<: AbstractGPUArray}, iter)
     if isbits(iter) && isa(isize, Base.HasShape) && style != nothing && isa(ettrait, Base.HasEltype)
         # We can collect on the GPU
         A = similar(AT, eltype_or(AT, eltype(iter)), size(iter))
-        gpu_call(collect_kernel, A, (A, iter, style))
+        gpu_call(collect_kernel, A, A, iter, style)
         A
     else
         convert(AT, collect(iter))

src/host/indexing.jl

Lines changed: 3 additions & 2 deletions
@@ -98,7 +98,7 @@ function Base._unsafe_getindex!(dest::AbstractGPUArray, src::AbstractGPUArray, I
         return dest
     end
     idims = map(length, Is)
-    gpu_call(index_kernel, dest, (dest, src, idims, map(x-> to_index(dest, x), Is)))
+    gpu_call(index_kernel, dest, dest, src, idims, map(x-> to_index(dest, x), Is))
     return dest
 end
 
@@ -125,6 +125,7 @@ function Base._unsafe_setindex!(::IndexStyle, dest::T, src, Is::Union{Real, Abst
     idims = length.(Is)
     len = prod(idims)
     src_gpu = adapt(T, src)
-    gpu_call(setindex_kernel!, dest, (dest, src_gpu, idims, map(x-> to_index(dest, x), Is), len), len)
+    gpu_call(setindex_kernel!, dest, dest, src_gpu, idims, map(x-> to_index(dest, x), Is), len;
+             total_threads=len)
     return dest
 end

src/host/linalg.jl

Lines changed: 2 additions & 2 deletions
@@ -115,7 +115,7 @@ end
 ## high-level functionality
 
 function LinearAlgebra.transpose!(At::AbstractGPUArray{T, 2}, A::AbstractGPUArray{T, 2}) where T
-    gpu_call(At, (At, A)) do ctx, At, A
+    gpu_call(At, At, A) do ctx, At, A
         idx = @cartesianidx A ctx
         @inbounds At[idx[2], idx[1]] = A[idx[1], idx[2]]
         return
@@ -129,7 +129,7 @@ end
 
 function LinearAlgebra.permutedims!(dest::AbstractGPUArray, src::AbstractGPUArray, perm) where N
     perm isa Tuple || (perm = Tuple(perm))
-    gpu_call(dest, (dest, src, perm)) do ctx, dest, src, perm
+    gpu_call(dest, dest, src, perm) do ctx, dest, src, perm
         I = @cartesianidx src ctx
         @inbounds dest[genperm(I, perm)...] = src[I...]
         return

src/host/mapreduce.jl

Lines changed: 12 additions & 12 deletions
@@ -9,7 +9,7 @@ Base.count(pred::Function, A::AbstractGPUArray) = Int(mapreduce(pred, +, A; init
 
 Base.:(==)(A::AbstractGPUArray, B::AbstractGPUArray) = Bool(mapreduce(==, &, A, B; init = true))
 
-LinearAlgebra.ishermitian(A::AbstractGPUMatrix) = acc_mapreduce(==, &, true, A, (adjoint(A),))
+LinearAlgebra.ishermitian(A::AbstractGPUMatrix) = acc_mapreduce(==, &, true, A, adjoint(A))
 
 # hack to get around of fetching the first element of the AbstractGPUArray
 # as a startvalue, which is a bit complicated with the current reduce implementation
@@ -67,11 +67,11 @@ end
 function mapreduce_impl(f, op, ::NamedTuple{()}, A::GPUSrcArray, ::Colon)
     OT = gpu_promote_type(op, gpu_promote_type(f, eltype(A)))
     v0 = startvalue(op, OT) # TODO do this better
-    acc_mapreduce(f, op, v0, A, ())
+    acc_mapreduce(f, op, v0, A)
 end
 
 function mapreduce_impl(f, op, nt::NamedTuple{(:init,)}, A::GPUSrcArray, ::Colon)
-    acc_mapreduce(f, op, nt.init, A, ())
+    acc_mapreduce(f, op, nt.init, A)
 end
 
 function mapreduce_impl(f, op, nt, A::GPUSrcArray, dims)
@@ -80,10 +80,10 @@ end
 
 function acc_mapreduce end
 function Base.mapreduce(f, op, A::GPUSrcArray, B::GPUSrcArray, C::Number; init)
-    acc_mapreduce(f, op, init, A, (B, C))
+    acc_mapreduce(f, op, init, A, B, C)
 end
 function Base.mapreduce(f, op, A::GPUSrcArray, B::GPUSrcArray; init)
-    acc_mapreduce(f, op, init, A, (B,))
+    acc_mapreduce(f, op, init, A, B)
 end
 
 @generated function mapreducedim_kernel(ctx::AbstractKernelContext, f, op, R, A, range::NTuple{N, Any}) where N
@@ -118,7 +118,7 @@ end
 
 function Base._mapreducedim!(f, op, R::AbstractGPUArray, A::GPUSrcArray)
     range = ifelse.(length.(axes(R)) .== 1, axes(A), nothing)
-    gpu_call(mapreducedim_kernel, R, (f, op, R, A, range))
+    gpu_call(mapreducedim_kernel, R, f, op, R, A, range)
     return R
 end
 
@@ -165,17 +165,17 @@ for i = 0:10
 
 end
 
-function acc_mapreduce(f, op, v0::OT, A::GPUSrcArray, rest::Tuple) where {OT}
-    blocksize = 80
+function acc_mapreduce(f, op, v0::OT, A::GPUSrcArray, rest...) where {OT}
+    blocks = 80
     threads = 256
-    if length(A) <= blocksize * threads
+    if length(A) <= blocks * threads
         args = zip(convert_to_cpu(A), convert_to_cpu.(rest)...)
         return mapreduce(x-> f(x...), op, args, init = v0)
     end
-    out = similar(A, OT, (blocksize,))
+    out = similar(A, OT, (blocks,))
     fill!(out, v0)
-    args = (f, op, v0, A, Val{threads}(), out, rest...)
-    gpu_call(reduce_kernel, out, args, ((blocksize,), (threads,)))
+    gpu_call(reduce_kernel, out, f, op, v0, A, Val{threads}(), out, rest...;
+             threads=threads, blocks=blocks)
     reduce(op, Array(out))
 end
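The `acc_mapreduce` change (turning the `rest::Tuple` parameter into a `rest...` vararg) is internal; user-facing reductions are unchanged. A small sketch of calls that route through the code above, again assuming the `JLArray` reference back-end:

```julia
A = GPUArrays.JLArray(rand(Float32, 10_000))
B = GPUArrays.JLArray(rand(Float32, 10_000))

s = mapreduce(+, +, A, B; init = 0f0)  # elementwise f(aᵢ, bᵢ), combined with op = +
same = A == B                          # also lowers to a mapreduce with op = &
```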

src/host/random.jl

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ function global_rng(A::AbstractGPUArray)
 end
 
 function Random.rand!(rng::RNG, A::AbstractGPUArray{T}) where T <: Number
-    gpu_call(A, (rng.state, A,)) do ctx, randstates, a
+    gpu_call(A, rng.state, A) do ctx, randstates, a
         idx = linear_index(ctx)
         idx > length(a) && return
         @inbounds a[idx] = gpu_rand(T, ctx, randstates)

src/reference.jl

Lines changed: 1 addition & 2 deletions
@@ -52,8 +52,7 @@ end
 to_device(ctx, x::Tuple) = to_device.(Ref(ctx), x)
 to_device(ctx, x) = x
 
-function GPUArrays._gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tuple{Int, Int})
-    blocks, threads = blocks_threads
+function GPUArrays.gpu_call(::JLBackend, f, args...; blocks::Int, threads::Int)
     ctx = JLKernelContext(threads, blocks)
     device_args = to_device.(Ref(ctx), args)
     tasks = Array{Task}(undef, threads)
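With `_gpu_call` gone, each back-end now overloads `gpu_call` itself on its backend type, as the `JLBackend` method above does. A hypothetical out-of-tree back-end would follow the same shape (`MyBackend` and `my_launch` are invented for illustration):

```julia
struct MyBackend <: GPUArrays.AbstractGPUBackend end

function GPUArrays.gpu_call(::MyBackend, kernel, args...; threads::Int, blocks::Int)
    # By this point the host-side gpu_call has already resolved the launch
    # configuration, via the total_threads heuristic or an explicit threads/blocks.
    my_launch(kernel, args...; threads = threads, blocks = blocks)  # hypothetical launcher
end
```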
