
Commit b841f66

Shuffle some more around, rename State to Context.
1 parent 11e6179 commit b841f66

17 files changed: 254 additions, 253 deletions

src/GPUArrays.jl

Lines changed: 8 additions & 6 deletions
@@ -12,17 +12,19 @@ using AbstractFFTs
 
 using Adapt
 
-# device array
+# device functionality
+include("device/device.jl")
+include("device/execution.jl")
+## on-device
 include("device/abstractarray.jl")
 include("device/indexing.jl")
+include("device/memory.jl")
 include("device/synchronization.jl")
 
-# host array
+# host array abstraction
 include("host/abstractarray.jl")
-include("host/devices.jl")
-include("host/execution.jl")
 include("host/construction.jl")
-## integrations and specialized functionality
+## integrations and specialized methods
 include("host/base.jl")
 include("host/indexing.jl")
 include("host/broadcast.jl")
@@ -32,7 +34,7 @@ include("host/random.jl")
 include("host/quirks.jl")
 
 # CPU reference implementation
-include("array.jl")
+include("reference.jl")
 
 
 end # module

src/device/abstractarray.jl

Lines changed: 2 additions & 28 deletions
@@ -1,6 +1,6 @@
-# on-device functionality
+# on-device array type
 
-export AbstractDeviceArray, @LocalMemory
+export AbstractDeviceArray
 
 
 ## device array
@@ -31,29 +31,3 @@ function Base.sum(A::AbstractDeviceArray{T}) where T
     end
     acc
 end
-
-
-## thread-local array
-
-const shmem_counter = Ref{Int}(0)
-
-"""
-Creates a local static memory shared inside one block.
-Equivalent to `__local` of OpenCL or `__shared__ (<variable>)` of CUDA.
-"""
-macro LocalMemory(state, T, N)
-    id = (shmem_counter[] += 1)
-    quote
-        LocalMemory($(esc(state)), $(esc(T)), Val($(esc(N))), Val($id))
-    end
-end
-
-"""
-Creates a block local array pointer with `T` being the element type
-and `N` the length. Both T and N need to be static! C is a counter for
-approriately get the correct Local mem id in CUDAnative.
-This is an internal method which needs to be overloaded by the GPU Array backends
-"""
-function LocalMemory(state, ::Type{T}, ::Val{dims}, ::Val{id}) where {T, dims, id}
-    error("Not implemented") # COV_EXCL_LINE
-end

src/device/device.jl

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+# device management and properties
+
+export AbstractGPUDevice
+
+abstract type AbstractGPUDevice end
+
+"""
+Hardware threads of device
+"""
+threads(::AbstractGPUDevice) = error("Not implemented") # COV_EXCL_LINE
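
The new `AbstractGPUDevice` supertype gives backends a common hook for exposing device properties such as `threads`. A minimal sketch of how a backend might opt in, assuming a made-up `FakeDevice` type and thread count (neither is part of this commit):

```julia
using GPUArrays

# Hypothetical backend device; the type name and the constant are placeholders.
struct FakeDevice <: AbstractGPUDevice end

# Report how many hardware threads this (imaginary) device offers.
GPUArrays.threads(::FakeDevice) = 256
```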

src/host/execution.jl renamed to src/device/execution.jl

Lines changed: 7 additions & 5 deletions
@@ -1,9 +1,11 @@
 # kernel execution
 
-export AbstractGPUBackend, gpu_call, synchronize, thread_blocks_heuristic
+export AbstractGPUBackend, AbstractKernelContext, gpu_call, synchronize, thread_blocks_heuristic
 
 abstract type AbstractGPUBackend end
 
+abstract type AbstractKernelContext end
+
 backend(::Type{T}) where T = error("Can't choose GPU backend for $T")
 
 """
@@ -12,12 +14,12 @@ backend(::Type{T}) where T = error("Can't choose GPU backend for $T")
 Calls function `kernel` on the GPU.
 `A` must be an AbstractGPUArray and will help to dispatch to the correct GPU backend
 and supplies queues and contexts.
-Calls the kernel function with `kernel(state, args...)`, where state is dependant on the backend
-and can be used for getting an index into `A` with `linear_index(state)`.
+Calls the kernel function with `kernel(ctx, args...)`, where ctx is dependant on the backend
+and can be used for getting an index into `A` with `linear_index(ctx)`.
 Optionally, a launch configuration can be supplied in the following way:
 
 1) A single integer, indicating how many work items (total number of threads) you want to launch.
-   in this case `linear_index(state)` will be a number in the range `1:configuration`
+   in this case `linear_index(ctx)` will be a number in the range `1:configuration`
 2) Pass a tuple of integer tuples to define blocks and threads per blocks!
 
 """
@@ -38,7 +40,7 @@ function gpu_call(kernel, A::AbstractArray, args::Tuple, configuration = length(
     Found: $configurations
     Configuration needs to be:
     1) A single integer, indicating how many work items (total number of threads) you want to launch.
-       in this case `linear_index(state)` will be a number in the range 1:configuration
+       in this case `linear_index(ctx)` will be a number in the range 1:configuration
     2) Pass a tuple of integer tuples to define blocks and threads per blocks!
     `linear_index` will be inbetween 1:prod((blocks..., threads...))
     """)

src/device/indexing.jl

Lines changed: 16 additions & 16 deletions
@@ -5,63 +5,63 @@ export global_size, synchronize_threads, linear_index
 
 # thread indexing functions
 for f in (:blockidx, :blockdim, :threadidx, :griddim)
-    @eval $f(state)::Int = error("Not implemented") # COV_EXCL_LINE
+    @eval $f(ctx::AbstractKernelContext)::Int = error("Not implemented") # COV_EXCL_LINE
     @eval export $f
 end
 
 """
-    global_size(state)
+    global_size(ctx::AbstractKernelContext)
 
 Global size == blockdim * griddim == total number of kernel execution
 """
-@inline function global_size(state)
-    griddim(state) * blockdim(state)
+@inline function global_size(ctx::AbstractKernelContext)
+    griddim(ctx) * blockdim(ctx)
 end
 
 """
-    linear_index(state)
+    linear_index(ctx::AbstractKernelContext)
 
 linear index corresponding to each kernel launch (in OpenCL equal to get_global_id).
 
 """
-@inline function linear_index(state)
-    (blockidx(state) - 1) * blockdim(state) + threadidx(state)
+@inline function linear_index(ctx::AbstractKernelContext)
+    (blockidx(ctx) - 1) * blockdim(ctx) + threadidx(ctx)
 end
 
 """
-    linearidx(A, statesym = :state)
+    linearidx(A, ctxsym = :ctx)
 
 Macro form of `linear_index`, which calls return when out of bounds.
 So it can be used like this:
 
 ```julia
-function kernel(state, A)
-    idx = @linear_index A state
+function kernel(ctx::AbstractKernelContext, A)
+    idx = @linear_index A ctx
     # from here on it's save to index into A with idx
     @inbounds begin
         A[idx] = ...
    end
 end
 ```
 """
-macro linearidx(A, statesym = :state)
+macro linearidx(A, ctxsym = :ctx)
     quote
         x1 = $(esc(A))
-        i1 = linear_index($(esc(statesym)))
+        i1 = linear_index($(esc(ctxsym)))
         i1 > length(x1) && return
         i1
     end
 end
 
 """
-    cartesianidx(A, statesym = :state)
+    cartesianidx(A, ctxsym = :ctx)
 
-Like [`@linearidx(A, statesym = :state)`](@ref), but returns an N-dimensional `NTuple{ndim(A), Int}` as index
+Like [`@linearidx(A, ctxsym = :ctx)`](@ref), but returns an N-dimensional `NTuple{ndim(A), Int}` as index
 """
-macro cartesianidx(A, statesym = :state)
+macro cartesianidx(A, ctxsym = :ctx)
     quote
         x = $(esc(A))
-        i2 = @linearidx(x, $(esc(statesym)))
+        i2 = @linearidx(x, $(esc(ctxsym)))
         gpu_ind2sub(x, i2)
     end
 end
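
A hypothetical sketch (not part of the commit) of how a backend's context type could implement the indexing primitives declared as stubs above; `FakeCtx` and the hard-coded dimensions are placeholders:

```julia
using GPUArrays

# Imaginary kernel context carrying a 1-based thread/block position.
struct FakeCtx <: AbstractKernelContext
    thread::Int
    block::Int
end

GPUArrays.threadidx(ctx::FakeCtx) = ctx.thread
GPUArrays.blockidx(ctx::FakeCtx)  = ctx.block
GPUArrays.blockdim(::FakeCtx)     = 256   # threads per block (made up)
GPUArrays.griddim(::FakeCtx)      = 4     # blocks per launch (made up)

# With these definitions:
#   linear_index(FakeCtx(1, 2)) == (2 - 1) * 256 + 1 == 257
#   global_size(FakeCtx(1, 2))  == 4 * 256 == 1024
```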

src/device/memory.jl

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+# on-device memory management
+
+export @LocalMemory
+
+
+## thread-local array
+
+const shmem_counter = Ref{Int}(0)
+
+"""
+Creates a local static memory shared inside one block.
+Equivalent to `__local` of OpenCL or `__shared__ (<variable>)` of CUDA.
+"""
+macro LocalMemory(ctx, T, N)
+    id = (shmem_counter[] += 1)
+    quote
+        LocalMemory($(esc(ctx)), $(esc(T)), Val($(esc(N))), Val($id))
+    end
+end
+
+"""
+Creates a block local array pointer with `T` being the element type
+and `N` the length. Both T and N need to be static! C is a counter for
+approriately get the correct Local mem id in CUDAnative.
+This is an internal method which needs to be overloaded by the GPU Array backends
+"""
+function LocalMemory(ctx, ::Type{T}, ::Val{dims}, ::Val{id}) where {T, dims, id}
+    error("Not implemented") # COV_EXCL_LINE
+end
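
As a usage sketch (not part of this change), a kernel could request a block-local scratch buffer via `@LocalMemory` and coordinate it with `synchronize_threads`; this assumes a backend that implements `LocalMemory` as an indexable device array, and the element type and length must be compile-time constants:

```julia
using GPUArrays

# Hedged sketch: reverse each 256-element block of A in place via local memory.
function reverse_block_kernel!(ctx, A)
    tmp = @LocalMemory(ctx, Float32, 256)   # static buffer shared within one block
    t = threadidx(ctx)
    @inbounds tmp[t] = A[linear_index(ctx)]
    synchronize_threads(ctx)                 # wait until every thread has written
    @inbounds A[linear_index(ctx)] = tmp[256 - t + 1]
    return
end
```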

src/device/synchronization.jl

Lines changed: 2 additions & 2 deletions
@@ -3,11 +3,11 @@
 export synchronize_threads
 
 """
-    synchronize_threads(state)
+    synchronize_threads(ctx::AbstractKernelContext)
 
 in CUDA terms `__synchronize`
 in OpenCL terms: `barrier(CLK_LOCAL_MEM_FENCE)`
 """
-function synchronize_threads(state)
+function synchronize_threads(ctx::AbstractKernelContext)
     error("Not implemented") # COV_EXCL_LINE
 end

src/host/abstractarray.jl

Lines changed: 9 additions & 2 deletions
@@ -15,6 +15,13 @@ const AbstractGPUVector{T} = AbstractGPUArray{T, 1}
 const AbstractGPUMatrix{T} = AbstractGPUArray{T, 2}
 const AbstractGPUVecOrMat{T} = Union{AbstractGPUArray{T, 1}, AbstractGPUArray{T, 2}}
 
+"""
+    device(A::AbstractArray)
+
+Gets the device associated to the Array `A`
+"""
+device(A::AbstractArray) = error("Not implemented") # COV_EXCL_LINE
+
 
 # input/output
 
@@ -136,8 +143,8 @@ end
 Base.copyto!(dest::AbstractGPUArray, src::AbstractGPUArray) =
     copyto!(dest, CartesianIndices(dest), src, CartesianIndices(src))
 
-function copy_kernel!(state, dest, dest_offsets, src, src_offsets, shape, shape_dest, shape_source, length)
-    i = linear_index(state)
+function copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets, shape, shape_dest, shape_source, length)
+    i = linear_index(ctx)
     if i <= length
         # TODO can this be done faster and smarter?
         idx = gpu_ind2sub(shape, i)
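
A hypothetical sketch (not from the commit) of how a backend array type would answer the new `device` query, reusing the placeholder `FakeDevice` from the device.jl example above:

```julia
using GPUArrays

# Placeholder array type; a real backend would wrap GPU memory here.
struct FakeArray{T, N} <: AbstractGPUArray{T, N}
    data::Array{T, N}
end

# Every FakeArray lives on the imaginary device defined earlier.
GPUArrays.device(::FakeArray) = FakeDevice()
```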

src/host/base.jl

Lines changed: 4 additions & 4 deletions
@@ -62,8 +62,8 @@ end
 function Base.repeat(a::AbstractGPUVecOrMat, m::Int, n::Int = 1)
     o, p = size(a, 1), size(a, 2)
     b = similar(a, o*m, p*n)
-    gpu_call(a, (b, a, o, p, m, n), n) do state, b, a, o, p, m, n
-        j = linear_index(state)
+    gpu_call(a, (b, a, o, p, m, n), n) do ctx, b, a, o, p, m, n
+        j = linear_index(ctx)
         j > n && return
         d = (j - 1) * p + 1
         @inbounds for i in 1:m
@@ -82,8 +82,8 @@ end
 function Base.repeat(a::AbstractGPUVector, m::Int)
     o = length(a)
     b = similar(a, o*m)
-    gpu_call(a, (b, a, o, m), m) do state, b, a, o, m
-        i = linear_index(state)
+    gpu_call(a, (b, a, o, m), m) do ctx, b, a, o, m
+        i = linear_index(ctx)
         i > m && return
         c = (i - 1)*o + 1
         @inbounds for i in 1:o

src/host/broadcast.jl

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ end
 @inline function Base.copyto!(dest::GPUDestArray, bc::Broadcasted{Nothing})
     axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
     bc′ = Broadcast.preprocess(dest, bc)
-    gpu_call(dest, (dest, bc′)) do state, dest, bc′
+    gpu_call(dest, (dest, bc′)) do ctx, dest, bc′
         let I = CartesianIndex(@cartesianidx(dest))
             @inbounds dest[I] = bc′[I]
         end
