
Commit 7e330b9: Update docs.
1 parent: b841f66

8 files changed: +72 −86 lines changed

docs/src/interface.md

Lines changed: 33 additions & 55 deletions

@@ -5,83 +5,61 @@ implement the interfaces listed on this page. GPUArrays is designed around having
 different array types to represent a GPU array: one that only ever lives on the host, and
 one that actually can be instantiated on the device (i.e. in kernels).
 
-## Host-side
 
-Your host-side array type should build on the `AbstractGPUArray` supertype:
+## Device functionality
 
-```@docs
-AbstractGPUArray
-```
-
-First of all, you should implement operations that are expected to be defined for any
-`AbstractArray` type. Refer to the Julia manual for more details, or look at the `JLArray`
-reference implementation.
-
-To be able to actually use the functionality that is defined for `AbstractGPUArray`s, you
-should provide implementations of the following interfaces:
+Several types and interfaces are related to the device and execution of code on it. First of
+all, you need to provide a type that represents your device and exposes some properties of
+it:
 
 ```@docs
-GPUArrays.unsafe_reinterpret
-```
-
-### Devices
-
-```@docs
-GPUArrays.device
-GPUArrays.synchronize
+GPUArrays.AbstractGPUDevice
+GPUArrays.threads
 ```
 
-### Execution
+Another important set of interfaces relates to executing code on the device:
 
 ```@docs
 GPUArrays.AbstractGPUBackend
-GPUArrays.backend
-```
-
-```@docs
-GPUArrays._gpu_call
+GPUArrays.AbstractKernelContext
+GPUArrays.gpu_call
+GPUArrays.synchronize
+GPUArrays.thread_block_heuristic
 ```
 
-### Linear algebra
+Finally, you need to provide implementations of certain methods that will be executed on the
+device itself:
 
 ```@docs
-GPUArrays.blas_module
-GPUArrays.blasbuffer
+GPUArrays.AbstractDeviceArray
+GPUArrays.LocalMemory
+GPUArrays.synchronize_threads
+GPUArrays.blockidx
+GPUArrays.blockdim
+GPUArrays.threadidx
+GPUArrays.griddim
 ```
 
 
-## Device-side
+## Host abstractions
 
-To work with GPU memory on the device itself, e.g. within a kernel, we need a different
-type: Most functionality will behave differently when running on the GPU, e.g., accessing
-memory directly instead of copying it to the host. We should also take care not to call into
-any host library, such as the Julia runtime or the system's math library.
+You should provide an array type that builds on the `AbstractGPUArray` supertype:
 
 ```@docs
-AbstractDeviceArray
+AbstractGPUArray
```
 
-Your device array type should again implement the core elements of the `AbstractArray`
-interface, such as indexing and certain getters. Refer to the Julia manual for more details,
-or look at the `JLDeviceArray` reference implementation.
+First of all, you should implement operations that are expected to be defined for any
+`AbstractArray` type. Refer to the Julia manual for more details, or look at the `JLArray`
+reference implementation.
 
-You should also provide implementations of several "GPU intrinsics". To make sure the
-correct implementation is called, the first argument to these intrinsics will be the kernel
-state object from before.
+To be able to actually use the functionality that is defined for `AbstractGPUArray`s, you
+should provide implementations of the following interfaces:
 
 ```@docs
-GPUArrays.LocalMemory
-GPUArrays.synchronize_threads
-GPUArrays.blockidx_x
-GPUArrays.blockidx_y
-GPUArrays.blockidx_z
-GPUArrays.blockdim_x
-GPUArrays.blockdim_y
-GPUArrays.blockdim_z
-GPUArrays.threadidx_x
-GPUArrays.threadidx_y
-GPUArrays.threadidx_z
-GPUArrays.griddim_x
-GPUArrays.griddim_y
-GPUArrays.griddim_z
+GPUArrays.backend
+GPUArrays.device
+GPUArrays.unsafe_reinterpret
+GPUArrays.blas_module
+GPUArrays.blasbuffer
 ```
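
Taken together, the restructured page asks a back-end for one device type, one backend type, and one host array type. The following is a minimal sketch of what a downstream package might define against the renamed interface, loosely modeled on the `JLArray` reference implementation; `MyDevice`, `MyBackend`, `MyArray` and all method bodies are hypothetical placeholders, not part of this commit.

```julia
using GPUArrays

# hypothetical back-end types; only the abstract supertypes come from GPUArrays
struct MyDevice <: GPUArrays.AbstractGPUDevice end
struct MyBackend <: GPUArrays.AbstractGPUBackend end

# host array type building on AbstractGPUArray ("Host abstractions" above)
struct MyArray{T, N} <: AbstractGPUArray{T, N}
    data::Array{T, N}   # stand-in for a real device allocation
end
Base.size(A::MyArray) = size(A.data)   # part of the basic AbstractArray interface

# "Device functionality": expose the device and a property of it
GPUArrays.device(A::MyArray) = MyDevice()
GPUArrays.threads(dev::MyDevice) = 256   # hardware threads, per the docstring

# execution: route gpu_call for this array type to this back-end
GPUArrays.backend(::Type{<:MyArray}) = MyBackend()
```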

src/GPUArrays.jl

Lines changed: 2 additions & 2 deletions

@@ -15,13 +15,13 @@ using Adapt
 # device functionality
 include("device/device.jl")
 include("device/execution.jl")
-## on-device
+## executed on-device
 include("device/abstractarray.jl")
 include("device/indexing.jl")
 include("device/memory.jl")
 include("device/synchronization.jl")
 
-# host array abstraction
+# host abstractions
 include("host/abstractarray.jl")
 include("host/construction.jl")
 ## integrations and specialized methods

src/device/device.jl

Lines changed: 7 additions & 0 deletions

@@ -4,6 +4,13 @@ export AbstractGPUDevice
 
 abstract type AbstractGPUDevice end
 
+"""
+    device(A::AbstractArray)
+
+Gets the device associated with the array `A`.
+"""
+device(A::AbstractArray) = error("This array is not a GPU array") # COV_EXCL_LINE
+
 """
 Hardware threads of device
 """

src/device/execution.jl

Lines changed: 6 additions & 1 deletion

@@ -6,7 +6,12 @@ abstract type AbstractGPUBackend end
 
 abstract type AbstractKernelContext end
 
-backend(::Type{T}) where T = error("Can't choose GPU backend for $T")
+"""
+    backend(T::Type{<:AbstractArray})
+
+Gets the GPUArrays back-end responsible for managing arrays of type `T`.
+"""
+backend(::Type{<:AbstractArray}) = error("This array is not a GPU array") # COV_EXCL_LINE
 
 """
     gpu_call(kernel::Function, A::AbstractGPUArray, args::Tuple, configuration = length(A))
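
With `backend` now documented as the lookup from array type to back-end, the launch path reads naturally: `gpu_call(kernel, A, args, configuration)` presumably resolves `backend(typeof(A))` and forwards to that back-end's `_gpu_call`, whose signature appears in `src/reference.jl` below. A hedged sketch of the back-end half, continuing the hypothetical `MyBackend`; the body is illustrative only:

```julia
# signature modeled on the JLBackend method in src/reference.jl below
function GPUArrays._gpu_call(::MyBackend, f, A, args::Tuple, blocks_threads::Tuple)
    blocks, threads = blocks_threads   # e.g. ((5,), (2,)) as in the tests
    # a real back-end would launch f once per thread here, passing its own
    # AbstractKernelContext subtype as the first (ctx) argument
end
```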

src/host/abstractarray.jl

Lines changed: 2 additions & 6 deletions

@@ -15,12 +15,8 @@ const AbstractGPUVector{T} = AbstractGPUArray{T, 1}
 const AbstractGPUMatrix{T} = AbstractGPUArray{T, 2}
 const AbstractGPUVecOrMat{T} = Union{AbstractGPUArray{T, 1}, AbstractGPUArray{T, 2}}
 
-"""
-    device(A::AbstractArray)
-
-Gets the device associated to the Array `A`
-"""
-device(A::AbstractArray) = error("Not implemented") # COV_EXCL_LINE
+device(::AbstractGPUDevice) = error("Not implemented") # COV_EXCL_LINE
+backend(::Type{<:AbstractGPUDevice}) = error("Not implemented") # COV_EXCL_LINE
 
 
 # input/output

src/reference.jl

Lines changed: 4 additions & 4 deletions

@@ -60,8 +60,8 @@ function GPUArrays._gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tup
     for blockidx in 1:blocks
         ctx.blockidx = blockidx
         for threadidx in 1:threads
-            thread_state = JLKernelContext(ctx, threadidx)
-            tasks[threadidx] = @async @allowscalar f(thread_state, device_args...)
+            thread_ctx = JLKernelContext(ctx, threadidx)
+            tasks[threadidx] = @async @allowscalar f(thread_ctx, device_args...)
             # TODO: require 1.3 and use Base.Threads.@spawn for actual multithreading
             # (this would require a different synchronization mechanism)
         end
@@ -73,7 +73,7 @@ function GPUArrays._gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tup
 end
 
 
-## on-device
+## executed on-device
 
 # array type
 
@@ -128,7 +128,7 @@ end
 
 
 #
-# Host array abstraction
+# Host abstractions
 #
 
 struct JLArray{T, N} <: AbstractGPUArray{T, N}
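
The reference back-end emulates a launch with one `@async` task per thread, which is what lets the renamed `ctx` argument be exercised on the CPU. A short usage sketch against `JLArray`, assuming the constructor pattern and unqualified intrinsics used in this commit's test suite:

```julia
using GPUArrays

x = GPUArrays.JLArray(Vector{Int}(undef, 10))
x .= 0
gpu_call(x, (x,), ((5,), (2,))) do ctx, x   # 5 blocks of 2 threads
    x[linear_index(ctx)] = blockidx(ctx)    # ctx replaces the old `state`
    return
end
Array(x)   # == [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], matching the test expectation
```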

test/testsuite/base.jl

Lines changed: 4 additions & 4 deletions

@@ -6,20 +6,20 @@ function cartesian_iter(state, A, res, Asize)
     return
 end
 
-function clmap!(state, f, out, b)
-    i = linear_index(state) # get the kernel index it gets scheduled on
+function clmap!(ctx, f, out, b)
+    i = linear_index(ctx) # get the kernel index it gets scheduled on
     out[i] = f(b[i])
     return
 end
 
-function ntuple_test(state, result, ::Val{N}) where N
+function ntuple_test(ctx, result, ::Val{N}) where N
     result[1] = ntuple(Val(N)) do i
         Float32(i) * 77f0
     end
     return
 end
 
-function ntuple_closure(state, result, ::Val{N}, testval) where N
+function ntuple_closure(ctx, result, ::Val{N}, testval) where N
     result[1] = ntuple(Val(N)) do i
         Float32(i) * testval
     end

test/testsuite/gpuinterface.jl

Lines changed: 14 additions & 14 deletions

@@ -3,44 +3,44 @@ function test_gpuinterface(AT)
     N = 10
     x = AT(Vector{Int}(undef, N))
     x .= 0
-    gpu_call(x, (x,)) do state, x
-        x[linear_index(state)] = 2
+    gpu_call(x, (x,)) do ctx, x
+        x[linear_index(ctx)] = 2
         return
     end
     @test all(x-> x == 2, Array(x))
 
-    gpu_call(x, (x,), N) do state, x
-        x[linear_index(state)] = 2
+    gpu_call(x, (x,), N) do ctx, x
+        x[linear_index(ctx)] = 2
         return
     end
     @test all(x-> x == 2, Array(x))
     configuration = ((N ÷ 2,), (2,))
-    gpu_call(x, (x,), configuration) do state, x
-        x[linear_index(state)] = threadidx(state)
+    gpu_call(x, (x,), configuration) do ctx, x
+        x[linear_index(ctx)] = threadidx(ctx)
         return
     end
     @test Array(x) == [1,2,1,2,1,2,1,2,1,2]
 
-    gpu_call(x, (x,), configuration) do state, x
-        x[linear_index(state)] = blockidx(state)
+    gpu_call(x, (x,), configuration) do ctx, x
+        x[linear_index(ctx)] = blockidx(ctx)
        return
     end
     @test Array(x) == [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]
     x2 = AT([0])
-    gpu_call(x, (x2,), configuration) do state, x
-        x[1] = blockdim(state)
+    gpu_call(x, (x2,), configuration) do ctx, x
+        x[1] = blockdim(ctx)
        return
     end
    @test Array(x2) == [2]
 
-    gpu_call(x, (x2,), configuration) do state, x
-        x[1] = griddim(state)
+    gpu_call(x, (x2,), configuration) do ctx, x
+        x[1] = griddim(ctx)
        return
     end
    @test Array(x2) == [5]
 
-    gpu_call(x, (x2,), configuration) do state, x
-        x[1] = global_size(state)
+    gpu_call(x, (x2,), configuration) do ctx, x
+        x[1] = global_size(ctx)
        return
     end
    @test Array(x2) == [10]
