
Commit f641b2f

Reshuffle and redocument some code.
1 parent 0868d9f commit f641b2f

4 files changed: +119 −115 lines changed


src/abstract_gpu_interface.jl

Lines changed: 21 additions & 108 deletions
@@ -1,111 +1,3 @@
-#=
-Abstraction over the GPU thread indexing functions.
-Uses CUDA like names
-=#
-for sym in (:x, :y, :z)
-    for f in (:blockidx, :blockdim, :threadidx, :griddim)
-        fname = Symbol(string(f, '_', sym))
-        @eval $fname(state)::Int = error("Not implemented")
-        @eval export $fname
-    end
-end
-
-
-
-"""
-    synchronize_threads(state)
-
-in CUDA terms `__synchronize`
-in OpenCL terms: `barrier(CLK_LOCAL_MEM_FENCE)`
-"""
-function synchronize_threads(state)
-    error("Not implemented")
-end
-
-
-"""
-    linear_index(state)
-
-linear index corresponding to each kernel launch (in OpenCL equal to get_global_id).
-
-"""
-@inline function linear_index(state)
-    (blockidx_x(state) - 1) * blockdim_x(state) + threadidx_x(state)
-end
-
-"""
-    linearidx(A, statesym = :state)
-
-Macro form of `linear_index`, which calls return when out of bounds.
-So it can be used like this:
-
-```julia
-function kernel(state, A)
-    idx = @linear_index A state
-    # from here on it's save to index into A with idx
-    @inbounds begin
-        A[idx] = ...
-    end
-end
-```
-"""
-macro linearidx(A, statesym = :state)
-    quote
-        x1 = $(esc(A))
-        i1 = linear_index($(esc(statesym)))
-        i1 > length(x1) && return
-        i1
-    end
-end
-
-
-"""
-    cartesianidx(A, statesym = :state)
-
-Like [`@linearidx(A, statesym = :state)`](@ref), but returns an N-dimensional `NTuple{ndim(A), Int}` as index
-"""
-macro cartesianidx(A, statesym = :state)
-    quote
-        x = $(esc(A))
-        i2 = @linearidx(x, $(esc(statesym)))
-        gpu_ind2sub(x, i2)
-    end
-end
-
-"""
-    global_size(state)
-
-Global size == blockdim * griddim == total number of kernel execution
-"""
-@inline function global_size(state)
-    # TODO nd version
-    griddim_x(state) * blockdim_x(state)
-end
-
-"""
-    device(A::AbstractArray)
-
-Gets the device associated to the Array `A`
-"""
-function device(A::AbstractArray)
-    # fallback is a noop, for backends not needing synchronization. This
-    # makes it easier to write generic code that also works for AbstractArrays
-end
-
-"""
-    synchronize(A::AbstractArray)
-
-Blocks until all operations are finished on `A`
-"""
-function synchronize(A::AbstractArray)
-    # fallback is a noop, for backends not needing synchronization. This
-    # makes it easier to write generic code that also works for AbstractArrays
-end
-#
-# @inline function synchronize_threads(state)
-#     CUDAnative.__syncthreads()
-# end
-
 abstract type GPUBackend end
 backend(::Type{T}) where T = error("Can't choose GPU backend for $T")

@@ -153,3 +45,24 @@ end
 
 # Internal GPU call function that needs to be overloaded by the backends.
 _gpu_call(::Any, f, A, args, thread_blocks) = error("Not implemented")
+
+
+"""
+    device(A::AbstractArray)
+
+Gets the device associated with the array `A`.
+"""
+function device(A::AbstractArray)
+    # fallback is a noop, for backends not needing synchronization. This
+    # makes it easier to write generic code that also works for AbstractArrays
+end
+
+"""
+    synchronize(A::AbstractArray)
+
+Blocks until all operations on `A` have finished.
+"""
+function synchronize(A::AbstractArray)
+    # fallback is a noop, for backends not needing synchronization. This
+    # makes it easier to write generic code that also works for AbstractArrays
+end
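
To make the backend interface above concrete, here is a minimal hypothetical sketch of how a backend could plug into it. `ThreadedBackend` and the NamedTuple `state` are illustrative assumptions, not types defined by this commit; a real backend would also provide `blockidx_x`-style methods for its state type.

```julia
# Hypothetical sketch: a toy CPU "backend" satisfying the interface.
struct ThreadedBackend <: GPUBackend end

# choose this backend for plain Arrays (illustrative only)
backend(::Type{<:Array}) = ThreadedBackend()

function _gpu_call(::ThreadedBackend, f, A, args, thread_blocks)
    blocks, threads = thread_blocks
    for b in 1:blocks, t in 1:threads
        # `state` carries the indexing information that blockidx_x,
        # blockdim_x, threadidx_x, griddim_x would read on a real device
        state = (blockidx = b, blockdim = threads, threadidx = t, griddim = blocks)
        f(state, args...)
    end
end
```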

src/abstractarray.jl

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
-# Dense GPU Array
+# core definition of the GPUArray type
+
 abstract type GPUArray{T, N} <: DenseArray{T, N} end
 
 # Sampler type that acts like a texture/image and allows interpolated access
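
For orientation, a backend package subtypes `GPUArray` with its own concrete array type. The following is a hedged sketch only; `MyBackendArray` and its host-memory `data` field are assumptions, and a usable subtype would implement much more of the `AbstractArray` interface.

```julia
# Hypothetical sketch of a backend array type.
struct MyBackendArray{T, N} <: GPUArray{T, N}
    data::Array{T, N}   # a real backend would hold a device buffer here
end

Base.size(A::MyBackendArray) = size(A.data)
```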

src/blas.jl

Lines changed: 7 additions & 6 deletions
@@ -1,14 +1,18 @@
 
-# Interface that needs to be overwritten by backend
-# Slightly difference behavior from buffer, since not all blas backends work directly with
-# the gpu array buffer
+# calls to standard BLAS interfaces
+
+## interface
+
 function blas_module(A)
     error("$(typeof(A)) doesn't support BLAS operations")
 end
 function blasbuffer(A)
     error("$(typeof(A)) doesn't support BLAS operations")
 end
 
+
+## operations
+
 for elty in (Float32, Float64, ComplexF32, ComplexF64)
     T = VERSION >= v"1.3.0-alpha.115" ? :(Union{($elty), Bool}) : elty
     @eval begin
@@ -53,7 +57,6 @@ function LinearAlgebra.rmul!(X::GPUArray{T}, s::Number) where T <: Union{Float32
     X
 end
 
-
 for elty in (Float32, Float64, ComplexF32, ComplexF64)
     T = VERSION >= v"1.3.0-alpha.115" ? :(Union{($elty), Bool}) : elty
     @eval begin
@@ -76,7 +79,6 @@ for elty in (Float32, Float64, ComplexF32, ComplexF64)
     end
 end
 
-
 for elty in (Float32, Float64, ComplexF32, ComplexF64)
     @eval begin
         function BLAS.axpy!(
@@ -92,7 +94,6 @@ for elty in (Float32, Float64, ComplexF32, ComplexF64)
     end
 end
 
-
 for elty in (Float32, Float64, ComplexF32, ComplexF64)
     @eval begin
         function BLAS.gbmv!(trans::AbstractChar, m::Integer, kl::Integer, ku::Integer, alpha::($elty), A::GPUMatrix{$elty}, X::GPUVector{$elty}, beta::($elty), Y::GPUVector{$elty})
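
As a rough illustration of the `## interface` functions: a backend overrides `blas_module` to return the module whose BLAS routines should be called, and `blasbuffer` to return an object those routines can operate on. The `Array` methods below are assumptions for the sketch, not methods this commit defines.

```julia
using LinearAlgebra

# Illustrative sketch only: how a backend could satisfy the BLAS interface.
blas_module(A::Array) = LinearAlgebra.BLAS  # module providing gemm!/axpy!/...
blasbuffer(A::Array) = A                    # object those BLAS routines accept
```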

src/ondevice.jl

Lines changed: 89 additions & 0 deletions
@@ -1,3 +1,90 @@
+# functionality for vendor-agnostic kernels
+
+## indexing
+
+# thread indexing functions
+for sym in (:x, :y, :z)
+    for f in (:blockidx, :blockdim, :threadidx, :griddim)
+        fname = Symbol(string(f, '_', sym))
+        @eval $fname(state)::Int = error("Not implemented")
+        @eval export $fname
+    end
+end
+
+"""
+    global_size(state)
+
+Global size == blockdim * griddim == total number of kernel executions.
+"""
+@inline function global_size(state)
+    # TODO nd version
+    griddim_x(state) * blockdim_x(state)
+end
+
+"""
+    linear_index(state)
+
+Linear index of the current thread within the kernel launch (in OpenCL terms, `get_global_id`).
+
+"""
+@inline function linear_index(state)
+    (blockidx_x(state) - 1) * blockdim_x(state) + threadidx_x(state)
+end
+
+"""
+    linearidx(A, statesym = :state)
+
+Macro form of `linear_index`, which returns from the kernel when the index is out of bounds.
+It can be used like this:
+
+```julia
+function kernel(state, A)
+    idx = @linearidx A state
+    # from here on it's safe to index into A with idx
+    @inbounds begin
+        A[idx] = ...
+    end
+end
+```
+"""
+macro linearidx(A, statesym = :state)
+    quote
+        x1 = $(esc(A))
+        i1 = linear_index($(esc(statesym)))
+        i1 > length(x1) && return
+        i1
+    end
+end
+
+"""
+    cartesianidx(A, statesym = :state)
+
+Like [`@linearidx(A, statesym = :state)`](@ref), but returns an N-dimensional `NTuple{ndims(A), Int}` as the index.
+"""
+macro cartesianidx(A, statesym = :state)
+    quote
+        x = $(esc(A))
+        i2 = @linearidx(x, $(esc(statesym)))
+        gpu_ind2sub(x, i2)
+    end
+end
+
+
+## synchronization
+
+"""
+    synchronize_threads(state)
+
+in CUDA terms: `__syncthreads`
+in OpenCL terms: `barrier(CLK_LOCAL_MEM_FENCE)`
+"""
+function synchronize_threads(state)
+    error("Not implemented")
+end
+
+
+## device array
+
 abstract type AbstractDeviceArray{T, N} <: AbstractArray{T, N} end
 
 Base.IndexStyle(::AbstractDeviceArray) = IndexLinear()
@@ -19,6 +106,8 @@ function Base.sum(A::AbstractDeviceArray{T}) where T
 end
 
 
+## device memory
+
 const shmem_counter = Ref{Int}(0)
 
 """
