Remove redundant array argument.

maleadt · maleadt · commit 5a83d5bd7668 · 2020-01-27T16:48:40.000+01:00
diff --git a/src/device/execution.jl b/src/device/execution.jl
@@ -7,33 +7,42 @@ abstract type AbstractGPUBackend end
 abstract type AbstractKernelContext end
 
 """
-    backend(T::Type{<:AbstractArray})
+    backend(T::Type)
+    backend(x)
 
 Gets the GPUArrays back-end responsible for managing arrays of type `T`.
 """
-backend(::Type{<:AbstractArray}) = error("This array is not a GPU array") # COV_EXCL_LINE
+backend(::Type) = error("This object is not a GPU array") # COV_EXCL_LINE
+backend(x) = backend(typeof(x))
 
 """
-    gpu_call(kernel::Function, A::AbstractGPUArray, args...; kwargs...)
+    gpu_call(kernel::Function, arg0, args...; kwargs...)
 
-Calls function `kernel` on the GPU device that backs array `A`, passing along arguments
-`args`. The keyword arguments `kwargs` are not passed along, but are interpreted on the host
-to influence how the kernel is executed. The following keyword arguments are supported:
+Executes `kernel` on the device that backs `arg0` (see [`backend`](@ref)), passing along any
+arguments `args`. Additionally, the kernel will be passed the kernel execution context (see
+`AbstractKernelContext`), so its signature should be `(ctx::AbstractKernelContext, arg0,
+args...)`.
 
+The keyword arguments `kwargs` are not passed to the function, but are interpreted on the
+host to influence how the kernel is executed. The following keyword arguments are supported:
+
+- `target::AbstractArray`: specify which array object to use for determining execution
+  properties (defaults to the first argument `arg0`).
 - `total_threads::Int`: how many threads should be launched _in total_. The actual number of
-   threads and blocks is determined using a heuristic. Defaults to the length of `A` if no
-   other keyword arguments that influence the launch configuration are specified.
+  threads and blocks is determined using a heuristic. Defaults to the length of `target` if
+  no other keyword arguments that influence the launch configuration are specified.
 - `threads::Int` and `blocks::Int`: configure exactly how many threads and blocks are
-   launched. This cannot be used in combination with the `total_threads` argument.
+  launched. This cannot be used in combination with the `total_threads` argument.
 """
-function gpu_call(kernel::Base.Callable, A::AbstractArray, args...;
+function gpu_call(kernel::Base.Callable, args...;
+                  target::AbstractArray=first(args),
                   total_threads::Union{Int,Nothing}=nothing,
                   threads::Union{Int,Nothing}=nothing,
                   blocks::Union{Int,Nothing}=nothing,
                   kwargs...)
     # determine how many threads/blocks to launch
     if total_threads===nothing && threads===nothing && blocks===nothing
-        total_threads = length(A)
+        total_threads = length(target)
     end
     if total_threads !== nothing
         if threads !== nothing || blocks !== nothing
@@ -49,7 +58,7 @@ function gpu_call(kernel::Base.Callable, A::AbstractArray, args...;
         end
     end
 
-    gpu_call(backend(typeof(A)), kernel, args...; threads=threads, blocks=blocks, kwargs...)
+    gpu_call(backend(target), kernel, args...; threads=threads, blocks=blocks, kwargs...)
 end
 
 gpu_call(backend::AbstractGPUBackend, kernel, args...; kwargs...) = error("Not implemented") # COV_EXCL_LINE
diff --git a/src/host/abstractarray.jl b/src/host/abstractarray.jl
@@ -161,7 +161,7 @@ function Base.copyto!(dest::AbstractGPUArray{T, N}, destcrange::CartesianIndices
 
     dest_offsets = first.(destcrange.indices) .- 1
     src_offsets = first.(srccrange.indices) .- 1
-    gpu_call(copy_kernel!, dest,
+    gpu_call(copy_kernel!,
              dest, dest_offsets, src, src_offsets, shape, size(dest), size(src), len;
              total_threads=len)
     dest
diff --git a/src/host/base.jl b/src/host/base.jl
@@ -62,7 +62,7 @@ end
 function Base.repeat(a::AbstractGPUVecOrMat, m::Int, n::Int = 1)
     o, p = size(a, 1), size(a, 2)
     b = similar(a, o*m, p*n)
-    gpu_call(a, b, a, o, p, m, n; total_threads=n) do ctx, b, a, o, p, m, n
+    gpu_call(b, a, o, p, m, n; target=a, total_threads=n) do ctx, b, a, o, p, m, n
         j = linear_index(ctx)
         j > n && return
         d = (j - 1) * p + 1
@@ -82,7 +82,7 @@ end
 function Base.repeat(a::AbstractGPUVector, m::Int)
     o = length(a)
     b = similar(a, o*m)
-    gpu_call(a, b, a, o, m; total_threads=m) do ctx, b, a, o, m
+    gpu_call(b, a, o, m; target=a, total_threads=m) do ctx, b, a, o, m
         i = linear_index(ctx)
         i > m && return
         c = (i - 1)*o + 1
diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl
@@ -47,7 +47,7 @@ end
 @inline function Base.copyto!(dest::GPUDestArray, bc::Broadcasted{Nothing})
     axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
     bc′ = Broadcast.preprocess(dest, bc)
-    gpu_call(dest, dest, bc′) do ctx, dest, bc′
+    gpu_call(dest, bc′) do ctx, dest, bc′
         let I = CartesianIndex(@cartesianidx(dest))
             @inbounds dest[I] = bc′[I]
         end
diff --git a/src/host/construction.jl b/src/host/construction.jl
@@ -9,7 +9,7 @@ function Base.fill(X::Type{<: AbstractGPUArray{T}}, val, dims::NTuple{N, Integer
     fill!(res, convert(T, val))
 end
 function Base.fill!(A::AbstractGPUArray{T}, x) where T
-    gpu_call(A, A, convert(T, x)) do ctx, a, val
+    gpu_call(A, convert(T, x)) do ctx, a, val
         idx = @linearidx(a, ctx)
         @inbounds a[idx] = val
         return
@@ -30,7 +30,7 @@ end
 
 function (T::Type{<: AbstractGPUArray})(s::UniformScaling, dims::Dims{2})
     res = zeros(T, dims)
-    gpu_call(uniformscaling_kernel, res, res, size(res, 1), s; total_threads=minimum(dims))
+    gpu_call(uniformscaling_kernel, res, size(res, 1), s; total_threads=minimum(dims))
     res
 end
 (T::Type{<: AbstractGPUArray})(s::UniformScaling, m::Integer, n::Integer) = T(s, Dims((m, n)))
@@ -67,7 +67,7 @@ function Base.convert(AT::Type{<: AbstractGPUArray}, iter)
     if isbits(iter) && isa(isize, Base.HasShape) && style != nothing && isa(ettrait, Base.HasEltype)
         # We can collect on the GPU
         A = similar(AT, eltype_or(AT, eltype(iter)), size(iter))
-        gpu_call(collect_kernel, A, A, iter, style)
+        gpu_call(collect_kernel, A, iter, style)
         A
     else
         convert(AT, collect(iter))
diff --git a/src/host/indexing.jl b/src/host/indexing.jl
@@ -98,7 +98,7 @@ function Base._unsafe_getindex!(dest::AbstractGPUArray, src::AbstractGPUArray, I
         return dest
     end
     idims = map(length, Is)
-    gpu_call(index_kernel, dest, dest, src, idims, map(x-> to_index(dest, x), Is))
+    gpu_call(index_kernel, dest, src, idims, map(x-> to_index(dest, x), Is))
     return dest
 end
 
@@ -125,7 +125,7 @@ function Base._unsafe_setindex!(::IndexStyle, dest::T, src, Is::Union{Real, Abst
     idims = length.(Is)
     len = prod(idims)
     src_gpu = adapt(T, src)
-    gpu_call(setindex_kernel!, dest, dest, src_gpu, idims, map(x-> to_index(dest, x), Is), len;
+    gpu_call(setindex_kernel!, dest, src_gpu, idims, map(x-> to_index(dest, x), Is), len;
              total_threads=len)
     return dest
 end
diff --git a/src/host/linalg.jl b/src/host/linalg.jl
@@ -115,7 +115,7 @@ end
 ## high-level functionality
 
 function LinearAlgebra.transpose!(At::AbstractGPUArray{T, 2}, A::AbstractGPUArray{T, 2}) where T
-    gpu_call(At, At, A) do ctx, At, A
+    gpu_call(At, A) do ctx, At, A
         idx = @cartesianidx A ctx
         @inbounds At[idx[2], idx[1]] = A[idx[1], idx[2]]
         return
@@ -129,7 +129,7 @@ end
 
 function LinearAlgebra.permutedims!(dest::AbstractGPUArray, src::AbstractGPUArray, perm) where N
     perm isa Tuple || (perm = Tuple(perm))
-    gpu_call(dest, dest, src, perm) do ctx, dest, src, perm
+    gpu_call(dest, src, perm) do ctx, dest, src, perm
         I = @cartesianidx src ctx
         @inbounds dest[genperm(I, perm)...] = src[I...]
         return
diff --git a/src/host/mapreduce.jl b/src/host/mapreduce.jl
@@ -118,7 +118,7 @@ end
 
 function Base._mapreducedim!(f, op, R::AbstractGPUArray, A::GPUSrcArray)
     range = ifelse.(length.(axes(R)) .== 1, axes(A), nothing)
-    gpu_call(mapreducedim_kernel, R, f, op, R, A, range)
+    gpu_call(mapreducedim_kernel, f, op, R, A, range; target=R)
     return R
 end
 
@@ -174,8 +174,8 @@ function acc_mapreduce(f, op, v0::OT, A::GPUSrcArray, rest...) where {OT}
     end
     out = similar(A, OT, (blocks,))
     fill!(out, v0)
-    gpu_call(reduce_kernel, out, f, op, v0, A, Val{threads}(), out, rest...;
-             threads=threads, blocks=blocks)
+    gpu_call(reduce_kernel, f, op, v0, A, Val{threads}(), out, rest...;
+             target=out, threads=threads, blocks=blocks)
     reduce(op, Array(out))
 end
 
diff --git a/src/host/random.jl b/src/host/random.jl
@@ -77,7 +77,7 @@ function global_rng(A::AbstractGPUArray)
 end
 
 function Random.rand!(rng::RNG, A::AbstractGPUArray{T}) where T <: Number
-    gpu_call(A, rng.state, A) do ctx, randstates, a
+    gpu_call(rng.state, A; target=A) do ctx, randstates, a
         idx = linear_index(ctx)
         idx > length(a) && return
         @inbounds a[idx] = gpu_rand(T, ctx, randstates)
diff --git a/test/testsuite/base.jl b/test/testsuite/base.jl
@@ -108,25 +108,25 @@ function test_base(AT)
 
         @testset "ntuple test" begin
             result = AT(Vector{NTuple{3, Float32}}(undef, 1))
-            gpu_call(ntuple_test, result, result, Val(3))
+            gpu_call(ntuple_test, result, Val(3))
             @test Array(result)[1] == (77, 2*77, 3*77)
             x = 88f0
-            gpu_call(ntuple_closure, result, result, Val(3), x)
+            gpu_call(ntuple_closure, result, Val(3), x)
             @test Array(result)[1] == (x, 2*x, 3*x)
         end
 
         @testset "cartesian iteration" begin
             Ac = rand(Float32, 32, 32)
             A = AT(Ac)
             result = fill!(copy(A), 0.0)
-            gpu_call(cartesian_iter, result, A, result, size(A))
+            gpu_call(cartesian_iter, A, result, size(A); target=result)
             Array(result) == Ac
         end
 
         @testset "Custom kernel from Julia function" begin
             x = AT(rand(Float32, 100))
             y = AT(rand(Float32, 100))
-            gpu_call(clmap!, x, -, x, y)
+            gpu_call(clmap!, -, x, y; target=x)
             jy = Array(y)
             @test map!(-, jy, jy) ≈ Array(x)
         end
diff --git a/test/testsuite/gpuinterface.jl b/test/testsuite/gpuinterface.jl
@@ -3,42 +3,42 @@ function test_gpuinterface(AT)
         N = 10
         x = AT(Vector{Int}(undef, N))
         x .= 0
-        gpu_call(x, x) do ctx, x
+        gpu_call(x) do ctx, x
             x[linear_index(ctx)] = 2
             return
         end
         @test all(x-> x == 2, Array(x))
 
-        gpu_call(x, x; total_threads=N) do ctx, x
+        gpu_call(x; total_threads=N) do ctx, x
             x[linear_index(ctx)] = 2
             return
         end
         @test all(x-> x == 2, Array(x))
-        gpu_call(x, x; threads=2, blocks=(N ÷ 2)) do ctx, x
+        gpu_call(x; threads=2, blocks=(N ÷ 2)) do ctx, x
             x[linear_index(ctx)] = threadidx(ctx)
             return
         end
         @test Array(x) == [1,2,1,2,1,2,1,2,1,2]
 
-        gpu_call(x, x; threads=2, blocks=(N ÷ 2)) do ctx, x
+        gpu_call(x; threads=2, blocks=(N ÷ 2)) do ctx, x
             x[linear_index(ctx)] = blockidx(ctx)
             return
         end
         @test Array(x) == [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]
         x2 = AT([0])
-        gpu_call(x, x2; threads=2, blocks=(N ÷ 2)) do ctx, x
+        gpu_call(x2; threads=2, blocks=(N ÷ 2), target=x) do ctx, x
             x[1] = blockdim(ctx)
             return
         end
         @test Array(x2) == [2]
 
-        gpu_call(x, x2; threads=2, blocks=(N ÷ 2)) do ctx, x
+        gpu_call(x2; threads=2, blocks=(N ÷ 2), target=x) do ctx, x
             x[1] = griddim(ctx)
             return
         end
         @test Array(x2) == [5]
 
-        gpu_call(x, x2; threads=2, blocks=(N ÷ 2)) do ctx, x
+        gpu_call(x2; threads=2, blocks=(N ÷ 2), target=x) do ctx, x
             x[1] = global_size(ctx)
             return
         end