- # reference implementation of the GPUArrays interfaces
+ # reference implementation of a CPU-based array type
+
+ module JLArrays
+
+ using GPUArrays

export JLArray

@@ -12,7 +16,11 @@ struct JLArray{T, N} <: AbstractGPUArray{T, N}
end


- # # construction
+ #
+ # AbstractArray interface
+ #
+
+ # # typical constructors

# type and dimensionality specified, accepting dims as tuples of Ints
JLArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} =
@@ -29,7 +37,6 @@ JLArray{T}(::UndefInitializer, dims::Integer...) where {T} =
# empty vector constructor
JLArray{T,1}() where {T} = JLArray{T,1}(undef, 0)

-
Base.similar(a::JLArray{T,N}) where {T,N} = JLArray{T,N}(undef, size(a))
Base.similar(a::JLArray{T}, dims::Base.Dims{N}) where {T,N} = JLArray{T,N}(undef, dims)
Base.similar(a::JLArray, ::Type{T}, dims::Base.Dims{N}) where {T,N} = JLArray{T,N}(undef, dims)
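
A quick usage sketch of the constructors and `similar` methods above (illustrative only; it assumes the allocating constructor bodies elided from this diff behave like Base's `undef` Array constructors):

    a = JLArray{Float32,2}(undef, (2, 3))   # eltype and dimensionality, dims as a tuple
    b = JLArray{Float32}(undef, 2, 3)       # dims as separate integers
    c = similar(a)                          # same eltype and size as a
    d = similar(a, Int, (4, 4))             # new eltype and new size
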
@@ -64,6 +71,8 @@ Base.convert(::Type{T}, x::T) where T <: JLArray = x

# # broadcast

+ using Base.Broadcast: BroadcastStyle, Broadcasted, ArrayStyle
+
BroadcastStyle(::Type{<:JLArray}) = ArrayStyle{JLArray}()

function Base.similar(bc::Broadcasted{ArrayStyle{JLArray}}, ::Type{T}) where T
@@ -72,29 +81,8 @@

Base.similar(bc::Broadcasted{ArrayStyle{JLArray}}, ::Type{T}, dims...) where {T} = JLArray{T}(undef, dims...)
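
With the `ArrayStyle` and `similar` definitions above, ordinary dot broadcasting materializes into new `JLArray`s. A minimal sketch, assuming the `JLArray(::Array)` constructor used elsewhere in this file and the generic broadcast kernels that GPUArrays provides:

    a = JLArray(rand(Float32, 4))
    b = a .+ 1f0     # result allocated via similar(::Broadcasted{ArrayStyle{JLArray}}, Float32)
    c = a .* b       # elementwise product, again a JLArray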

- # # gpuarray interface
-
- struct JLBackend <: GPUBackend end
- backend(::Type{<:JLArray}) = JLBackend()
-
- """
- Thread group local memory
- """
- struct LocalMem{N, T}
-     x::NTuple{N, Vector{T}}
- end

- to_device(state, x::JLArray) = x.data
- to_device(state, x::Tuple) = to_device.(Ref(state), x)
- to_device(state, x::Base.RefValue{<:JLArray}) = Base.RefValue(to_device(state, x[]))
- to_device(state, x) = x
-
- to_blocks(state, x) = x
- # unpacks local memory for each block
- to_blocks(state, x::LocalMem) = x.x[blockidx_x(state)]
-
- unsafe_reinterpret(::Type{T}, A::JLArray, size::Tuple) where T =
-     reshape(reinterpret(T, A.data), size)
+ # # memory operations

function Base.copyto!(dest::Array{T}, d_offset::Integer,
                      source::JLArray{T}, s_offset::Integer,
@@ -103,6 +91,7 @@ function Base.copyto!(dest::Array{T}, d_offset::Integer,
    @boundscheck checkbounds(source, s_offset+amount-1)
    copyto!(dest, d_offset, source.data, s_offset, amount)
end
+
function Base.copyto!(dest::JLArray{T}, d_offset::Integer,
                      source::Array{T}, s_offset::Integer,
                      amount::Integer) where T
@@ -111,6 +100,7 @@ function Base.copyto!(dest::JLArray{T}, d_offset::Integer,
    copyto!(dest.data, d_offset, source, s_offset, amount)
    dest
end
+
function Base.copyto!(dest::JLArray{T}, d_offset::Integer,
                      source::JLArray{T}, s_offset::Integer,
                      amount::Integer) where T
@@ -120,6 +110,45 @@ function Base.copyto!(dest::JLArray{T}, d_offset::Integer,
    dest
end
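
A short sketch of the offset copies defined above, moving data between a plain `Array` and a `JLArray` (illustrative; the element types must match, as the signatures require):

    src = JLArray(collect(1.0:10.0))
    dst = zeros(10)
    copyto!(dst, 1, src, 1, 10)   # JLArray to Array, backed by src.data
    copyto!(src, 1, dst, 1, 10)   # Array to JLArray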

+ # # fft
+
+ using AbstractFFTs
+
+ # defining our own plan type is the easiest way to pass around plans from the FFTW interface
+ # without ambiguities
+
+ struct FFTPlan{T}
+     p::T
+ end
+
+ AbstractFFTs.plan_fft(A::JLArray; kw_args...) = FFTPlan(plan_fft(A.data; kw_args...))
+ AbstractFFTs.plan_fft!(A::JLArray; kw_args...) = FFTPlan(plan_fft!(A.data; kw_args...))
+ AbstractFFTs.plan_bfft!(A::JLArray; kw_args...) = FFTPlan(plan_bfft!(A.data; kw_args...))
+ AbstractFFTs.plan_bfft(A::JLArray; kw_args...) = FFTPlan(plan_bfft(A.data; kw_args...))
+ AbstractFFTs.plan_ifft!(A::JLArray; kw_args...) = FFTPlan(plan_ifft!(A.data; kw_args...))
+ AbstractFFTs.plan_ifft(A::JLArray; kw_args...) = FFTPlan(plan_ifft(A.data; kw_args...))
+
+ function Base.:(*)(plan::FFTPlan, A::JLArray)
+     x = plan.p * A.data
+     JLArray(x)
+ end
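
A usage sketch of the plan wrappers above (illustrative; it assumes FFTW is loaded to supply the plans for the underlying `Array` data):

    using FFTW                # assumed provider of the wrapped Array plans
    a = JLArray(rand(ComplexF64, 8))
    p = plan_fft(a)           # FFTPlan wrapping an Array plan for a.data
    b = p * a                 # applies the wrapped plan and rewraps the result as a JLArray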
+
+
+
+ #
+ # AbstractGPUArray interface
+ #
+
+ GPUArrays.unsafe_reinterpret(::Type{T}, A::JLArray, size::Tuple) where T =
+     reshape(reinterpret(T, A.data), size)
+
+
+ # # execution
+
+ struct JLBackend <: AbstractGPUBackend end
+
+ GPUArrays.backend(::Type{<:JLArray}) = JLBackend()
+
mutable struct JLState{N}
    blockdim::NTuple{N, Int}
    griddim::NTuple{N, Int}
@@ -148,27 +177,12 @@ function JLState(state::JLState{N}, threadidx::NTuple{N}) where N
    )
end

- function LocalMemory(state::JLState, ::Type{T}, ::Val{N}, ::Val{C}) where {T, N, C}
-     state.localmem_counter += 1
-     lmems = state.localmems[blockidx_x(state)]
-     # first invocation in block
-     if length(lmems) < state.localmem_counter
-         lmem = fill(zero(T), N)
-         push!(lmems, lmem)
-         return lmem
-     else
-         return lmems[state.localmem_counter]
-     end
- end
-
- function AbstractDeviceArray(ptr::Array, shape::NTuple{N, Integer}) where N
-     reshape(ptr, shape)
- end
- function AbstractDeviceArray(ptr::Array, shape::Vararg{Integer, N}) where N
-     reshape(ptr, shape)
- end
+ to_device(state, x::JLArray) = x.data
+ to_device(state, x::Tuple) = to_device.(Ref(state), x)
+ to_device(state, x::Base.RefValue{<:JLArray}) = Base.RefValue(to_device(state, x[]))
+ to_device(state, x) = x

- function _gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tuple{T, T}) where T <: NTuple{N, Integer} where N
+ function GPUArrays._gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tuple{T, T}) where T <: NTuple{N, Integer} where N
    blocks, threads = blocks_threads
    idx = ntuple(i->1, length(blocks))
    blockdim = blocks
@@ -177,10 +191,9 @@ function _gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tuple{T, T})
    tasks = Array{Task}(undef, threads...)
    for blockidx in CartesianIndices(blockdim)
        state.blockidx = blockidx.I
-         block_args = to_blocks.(Ref(state), device_args)
        for threadidx in CartesianIndices(threads)
            thread_state = JLState(state, threadidx.I)
-             tasks[threadidx] = @async @allowscalar f(thread_state, block_args...)
+             tasks[threadidx] = @async @allowscalar f(thread_state, device_args...)
            # TODO: @async obfuscates the trace to any exception which happens during f
        end
        for t in tasks
@@ -190,47 +203,69 @@ function _gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tuple{T, T})
    return
end
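
This is the overload that GPUArrays' `gpu_call` entry point dispatches to. A hedged sketch of a kernel running through it; the `gpu_call(kernel, array, args)` calling convention is assumed from the GPUArrays version this commit targets, and the `blockidx_x`/`blockdim_x`/`threadidx_x` intrinsics for `JLState` are the ones defined in the indexing section further down:

    function memset_kernel(state, a, val)
        i = (GPUArrays.blockidx_x(state) - 1) * GPUArrays.blockdim_x(state) +
            GPUArrays.threadidx_x(state)
        if i <= length(a)
            @inbounds a[i] = val
        end
        return
    end

    a = JLArray(zeros(Float32, 16))
    gpu_call(memset_kernel, a, (a, 1f0))   # assumed entry point; each task writes one element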

- # "intrinsics"
- struct JLDevice end
- device(x::JLArray) = JLDevice()
- threads(dev::JLDevice) = 256
-
- @inline function synchronize_threads(::JLState)
-     #=
-     All threads are getting started asynchronously,so a yield will
-     yield to the next execution of the same function, which should call yield
-     at the exact same point in the program, leading to a chain of yields effectively syncing
-     the tasks (threads).
-     =#
+
+ # # gpu intrinsics
+
+ @inline function GPUArrays.synchronize_threads(::JLState)
+     # All threads are getting started asynchronously, so a yield will yield to the next
+     # execution of the same function, which should call yield at the exact same point in the
+     # program, leading to a chain of yields effectively syncing the tasks (threads).
    yield()
    return
end

- for (i, sym) in enumerate((:x, :y, :z))
-     for f in (:blockidx, :blockdim, :threadidx, :griddim)
-         fname = Symbol(string(f, '_', sym))
-         @eval $fname(state::JLState) = Int(state.$f[$i])
+ function GPUArrays.LocalMemory(state::JLState, ::Type{T}, ::Val{N}, ::Val{C}) where {T, N, C}
+     state.localmem_counter += 1
+     lmems = state.localmems[blockidx_x(state)]
+
+     # first invocation in block
+     if length(lmems) < state.localmem_counter
+         lmem = fill(zero(T), N)
+         push!(lmems, lmem)
+         return lmem
+     else
+         return lmems[state.localmem_counter]
    end
end

- blas_module(::JLArray) = LinearAlgebra.BLAS
- blasbuffer(A::JLArray) = A.data

- # defining our own plan type is the easiest way to pass around the plans in FFTW interface
- # without ambiguities
+ # # device properties

- struct FFTPlan{T}
-     p::T
+ struct JLDevice end
+
+ GPUArrays.device(x::JLArray) = JLDevice()
+
+ GPUArrays.threads(dev::JLDevice) = 256
+
+
+ # # linear algebra
+
+ using LinearAlgebra
+
+ GPUArrays.blas_module(::JLArray) = LinearAlgebra.BLAS
+ GPUArrays.blasbuffer(A::JLArray) = A.data
+
+
+
+ #
+ # AbstractDeviceArray interface
+ #
+
+ function GPUArrays.AbstractDeviceArray(ptr::Array, shape::NTuple{N, Integer}) where N
+     reshape(ptr, shape)
+ end
+ function GPUArrays.AbstractDeviceArray(ptr::Array, shape::Vararg{Integer, N}) where N
+     reshape(ptr, shape)
end
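
On the CPU a device-side array is just a reshaped `Array`, so both methods reduce to `reshape`. For example (illustrative):

    v = zeros(Float32, 6)
    m = GPUArrays.AbstractDeviceArray(v, 2, 3)   # a 2×3 array sharing v's memory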

- AbstractFFTs.plan_fft(A::JLArray; kw_args...) = FFTPlan(plan_fft(A.data; kw_args...))
- AbstractFFTs.plan_fft!(A::JLArray; kw_args...) = FFTPlan(plan_fft!(A.data; kw_args...))
- AbstractFFTs.plan_bfft!(A::JLArray; kw_args...) = FFTPlan(plan_bfft!(A.data; kw_args...))
- AbstractFFTs.plan_bfft(A::JLArray; kw_args...) = FFTPlan(plan_bfft(A.data; kw_args...))
- AbstractFFTs.plan_ifft!(A::JLArray; kw_args...) = FFTPlan(plan_ifft!(A.data; kw_args...))
- AbstractFFTs.plan_ifft(A::JLArray; kw_args...) = FFTPlan(plan_ifft(A.data; kw_args...))

- function Base.:(*)(plan::FFTPlan, A::JLArray)
-     x = plan.p * A.data
-     JLArray(x)
+ # # indexing
+
+ for (i, sym) in enumerate((:x, :y, :z))
+     for f in (:blockidx, :blockdim, :threadidx, :griddim)
+         fname = Symbol(string(f, '_', sym))
+         @eval GPUArrays.$fname(state::JLState) = Int(state.$f[$i])
+     end
+ end
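
For reference, the `@eval` loop above generates twelve one-line methods, for example:

    GPUArrays.blockidx_x(state::JLState) = Int(state.blockidx[1])
    GPUArrays.threadidx_y(state::JLState) = Int(state.threadidx[2])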
+
end