Commit 3f2ec79

Make AbstractDeviceArray a proper type.
1 parent b4d2f78 commit 3f2ec79

File tree (5 files changed, +99 -84 lines):

  src/GPUArrays.jl
  src/array.jl
  src/device/abstractarray.jl
  src/device/gpu.jl (deleted)
  src/device/synchronization.jl (new file)

src/GPUArrays.jl

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ using Adapt
 # device array
 include("device/abstractarray.jl")
 include("device/indexing.jl")
-include("device/gpu.jl")
+include("device/synchronization.jl")
 
 # host array
 include("host/abstractarray.jl")

src/array.jl

Lines changed: 58 additions & 37 deletions
@@ -6,6 +6,14 @@ using GPUArrays
 
 export JLArray
 
+
+#
+# Host array
+#
+
+# the definition of a host array type, implementing different Base interfaces
+# to make it function properly and behave like the Base Array type.
+
 struct JLArray{T, N} <: AbstractGPUArray{T, N}
     data::Array{T, N}
     dims::Dims{N}
@@ -15,12 +23,7 @@ struct JLArray{T, N} <: AbstractGPUArray{T, N}
     end
 end
 
-
-#
-# AbstractArray interface
-#
-
-## typical constructors
+## constructors
 
 # type and dimensionality specified, accepting dims as tuples of Ints
 JLArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} =
@@ -139,6 +142,8 @@ end
 # AbstractGPUArray interface
 #
 
+# implementation of GPUArrays-specific interfaces
+
 GPUArrays.unsafe_reinterpret(::Type{T}, A::JLArray, size::Tuple) where T =
     reshape(reinterpret(T, A.data), size)
 
@@ -177,7 +182,7 @@ function JLState(state::JLState{N}, threadidx::NTuple{N}) where N
     )
 end
 
-to_device(state, x::JLArray) = x.data
+to_device(state, x::JLArray{T,N}) where {T,N} = JLDeviceArray{T,N}(x.data, x.dims)
 to_device(state, x::Tuple) = to_device.(Ref(state), x)
 to_device(state, x::Base.RefValue{<: JLArray}) = Base.RefValue(to_device(state, x[]))
 to_device(state, x) = x
@@ -205,31 +210,6 @@ function GPUArrays._gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tup
 end
 
 
-## gpu intrinsics
-
-@inline function GPUArrays.synchronize_threads(::JLState)
-    # All threads are getting started asynchronously, so a yield will yield to the next
-    # execution of the same function, which should call yield at the exact same point in the
-    # program, leading to a chain of yields effectively syncing the tasks (threads).
-    yield()
-    return
-end
-
-function GPUArrays.LocalMemory(state::JLState, ::Type{T}, ::Val{N}, ::Val{C}) where {T, N, C}
-    state.localmem_counter += 1
-    lmems = state.localmems[blockidx_x(state)]
-
-    # first invocation in block
-    if length(lmems) < state.localmem_counter
-        lmem = fill(zero(T), N)
-        push!(lmems, lmem)
-        return lmem
-    else
-        return lmems[state.localmem_counter]
-    end
-end
-
-
 ## device properties
 
 struct JLDevice end
@@ -249,24 +229,65 @@ GPUArrays.blasbuffer(A::JLArray) = A.data
 
 
 #
-# AbstractDeviceArray interface
+# Device array
 #
 
-function GPUArrays.AbstractDeviceArray(ptr::Array, shape::NTuple{N, Integer}) where N
-    reshape(ptr, shape)
+# definition of a minimal device array type that supports the subset of operations
+# that are used in GPUArrays kernels
+
+struct JLDeviceArray{T, N} <: AbstractDeviceArray{T, N}
+    data::Array{T, N}
+    dims::Dims{N}
+
+    function JLDeviceArray{T,N}(data::Array{T, N}, dims::Dims{N}) where {T,N}
+        new(data, dims)
+    end
 end
-function GPUArrays.AbstractDeviceArray(ptr::Array, shape::Vararg{Integer, N}) where N
-    reshape(ptr, shape)
+
+function GPUArrays.LocalMemory(state::JLState, ::Type{T}, ::Val{dims}, ::Val{id}) where {T, dims, id}
+    state.localmem_counter += 1
+    lmems = state.localmems[blockidx_x(state)]
+
+    # first invocation in block
+    data = if length(lmems) < state.localmem_counter
+        lmem = fill(zero(T), dims)
+        push!(lmems, lmem)
+        lmem
+    else
+        lmems[state.localmem_counter]
+    end
+
+    N = length(dims)
+    JLDeviceArray{T,N}(data, tuple(dims...))
 end
 
 
+## array interface
+
+Base.size(x::JLDeviceArray) = x.dims
+
+
 ## indexing
 
+@inline Base.getindex(A::JLDeviceArray, index::Integer) = getindex(A.data, index)
+@inline Base.setindex!(A::JLDeviceArray, x, index::Integer) = setindex!(A.data, x, index)
+
 for (i, sym) in enumerate((:x, :y, :z))
     for f in (:blockidx, :blockdim, :threadidx, :griddim)
         fname = Symbol(string(f, '_', sym))
        @eval GPUArrays.$fname(state::JLState) = Int(state.$f[$i])
     end
 end
 
+
+## synchronization
+
+@inline function GPUArrays.synchronize_threads(::JLState)
+    # All threads are getting started asynchronously, so a yield will yield to the next
+    # execution of the same function, which should call yield at the exact same point in the
+    # program, leading to a chain of yields effectively syncing the tasks (threads).
+    yield()
+    return
+end
+
 end
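Side note (not part of the commit): the to_device change above is the heart of the patch. Previously a JLArray was unwrapped to its raw Base.Array before entering a kernel; now it is wrapped in JLDeviceArray, a proper AbstractDeviceArray subtype, so device-side code dispatches on a device array type. A minimal self-contained sketch of that pattern, with all names invented for illustration, might look like this:

    # Illustrative only; HostSide/DeviceSide/to_device are made-up stand-ins for
    # JLArray, JLDeviceArray, and the to_device method in this file.
    struct HostSide{T,N}
        data::Array{T,N}
    end

    # a thin wrapper exposing only what kernels need: size plus linear indexing
    struct DeviceSide{T,N} <: AbstractArray{T,N}
        data::Array{T,N}
        dims::NTuple{N,Int}
    end

    Base.size(A::DeviceSide) = A.dims
    Base.getindex(A::DeviceSide, i::Integer) = A.data[i]
    Base.setindex!(A::DeviceSide, v, i::Integer) = (A.data[i] = v)

    # wrap instead of unwrapping to a raw Array
    to_device(x::HostSide{T,N}) where {T,N} = DeviceSide{T,N}(x.data, size(x.data))

    h = HostSide(zeros(Float32, 4))
    d = to_device(h)
    d[1] = 42f0
    h.data[1]   # 42.0f0: in this CPU back-end the wrapper aliases the host storage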

src/device/abstractarray.jl

Lines changed: 27 additions & 1 deletion
@@ -1,6 +1,6 @@
 # on-device functionality
 
-export AbstractDeviceArray
+export AbstractDeviceArray, @LocalMemory
 
 
 ## device array
@@ -24,3 +24,29 @@ function Base.sum(A::AbstractDeviceArray{T}) where T
     end
     acc
 end
+
+
+## thread-local array
+
+const shmem_counter = Ref{Int}(0)
+
+"""
+Creates a local static memory shared inside one block.
+Equivalent to `__local` of OpenCL or `__shared__ (<variable>)` of CUDA.
+"""
+macro LocalMemory(state, T, N)
+    id = (shmem_counter[] += 1)
+    quote
+        LocalMemory($(esc(state)), $(esc(T)), Val($(esc(N))), Val($id))
+    end
+end
+
+"""
+Creates a block local array pointer with `T` being the element type
+and `N` the length. Both T and N need to be static! C is a counter for
+approriately get the correct Local mem id in CUDAnative.
+This is an internal method which needs to be overloaded by the GPU Array backends
+"""
+function LocalMemory(state, ::Type{T}, ::Val{dims}, ::Val{id}) where {T, dims, id}
+    error("Not implemented") # COV_EXCL_LINE
+end
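To see how these pieces are meant to be used together, here is a hedged sketch (not from this commit; the kernel body, its names, and the 16-thread launch are assumptions) of a kernel requesting block-local storage through the new macro:

    # Hypothetical kernel body; only @LocalMemory, threadidx_x and synchronize_threads
    # are GPUArrays primitives, everything else is invented for the sketch.
    function scratch_kernel(state, out)
        # expands to LocalMemory(state, Float32, Val(16), Val(id)) with a unique id
        tmp = @LocalMemory(state, Float32, 16)
        i = threadidx_x(state)
        tmp[i] = Float32(i)
        synchronize_threads(state)   # make every thread's write visible block-wide
        out[i] = tmp[17 - i]         # read a slot written by another thread
        return
    end

Launching the kernel (via gpu_call) is omitted here; the sketch assumes a single block of 16 threads so that both tmp[i] and tmp[17 - i] stay in bounds.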

src/device/gpu.jl

Lines changed: 0 additions & 45 deletions
This file was deleted.

src/device/synchronization.jl

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# synchronization
+
+export synchronize_threads
+
+"""
+    synchronize_threads(state)
+
+in CUDA terms `__synchronize`
+in OpenCL terms: `barrier(CLK_LOCAL_MEM_FENCE)`
+"""
+function synchronize_threads(state)
+    error("Not implemented") # COV_EXCL_LINE
+end
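The JLState implementation of this stub (see src/array.jl above) relies on cooperative task scheduling: every "thread" is a task, and a yield() at the same program point lets all tasks reach that point before any continues. A tiny single-threaded toy, not part of the commit and dependent on the scheduler running tasks in FIFO order, illustrates the idea:

    # Toy illustration of the yield-as-barrier trick used by the JLState back-end.
    function toy_thread(log, i)
        push!(log, (i, :before_barrier))
        yield()                      # stands in for synchronize_threads(state)
        push!(log, (i, :after_barrier))
    end

    log = Tuple{Int,Symbol}[]
    tasks = [@async toy_thread(log, i) for i in 1:3]
    foreach(wait, tasks)
    log   # all :before_barrier entries precede every :after_barrier entry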
