JuliaGPU
diff --git a/‎src/array.jl
Lines changed: 2 additions & 2 deletions b/‎src/array.jl
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/device/gpu.jl
Lines changed: 45 additions & 0 deletions b/‎src/device/gpu.jl
Lines changed: 45 additions & 0 deletions
diff --git a/‎src/device/indexing.jl
Lines changed: 71 additions & 0 deletions b/‎src/device/indexing.jl
Lines changed: 71 additions & 0 deletions
diff --git a/‎src/host/abstractarray.jl
Lines changed: 30 additions & 30 deletions b/‎src/host/abstractarray.jl
Lines changed: 30 additions & 30 deletions
diff --git a/‎src/host/base.jl
Lines changed: 8 additions & 8 deletions b/‎src/host/base.jl
Lines changed: 8 additions & 8 deletions
diff --git a/‎src/host/broadcast.jl
Lines changed: 9 additions & 9 deletions b/‎src/host/broadcast.jl
Lines changed: 9 additions & 9 deletions
@@ -1,8 +1,8 @@
-# reference implementation of the GPUArray interfaces
+# reference implementation of the GPUArrays interfaces
 
 export JLArray
 
-struct JLArray{T, N} <: GPUArray{T, N}
+struct JLArray{T, N} <: AbstractGPUArray{T, N}
     data::Array{T, N}
     dims::Dims{N}
 
 
@@ -0,0 +1,45 @@
+# gpu-specific functionality
+
+export global_size, synchronize_threads
+
+
+## synchronization
+
+"""
+     synchronize_threads(state)
+
+in CUDA terms: `__syncthreads()`
+in OpenCL terms: `barrier(CLK_LOCAL_MEM_FENCE)`
+"""
+function synchronize_threads(state)
+    error("Not implemented") # COV_EXCL_LINE
+end
+
+
+## device memory
+
+const shmem_counter = Ref{Int}(0)
+
+"""
+Creates a local static memory shared inside one block.
+Equivalent to `__local` of OpenCL or `__shared__ (<variable>)` of CUDA.
+"""
+macro LocalMemory(state, T, N)
+    id = (shmem_counter[] += 1)
+    quote
+        lémem = LocalMemory($(esc(state)), $(esc(T)), Val($(esc(N))), Val($id))
+        AbstractDeviceArray(lémem, $(esc(N)))
+    end
+end
+
+export @LocalMemory
+
+"""
+Creates a block local array pointer with `T` being the element type
+and `N` the length. Both `T` and `N` need to be static! `C` is a counter used to
+select the correct local memory id in CUDAnative.
+This is an internal method which needs to be overloaded by the GPU Array backends
+"""
+function LocalMemory(state, ::Type{T}, ::Val{N}, ::Val{C}) where {N, T, C}
+    error("Not implemented") # COV_EXCL_LINE
+end
@@ -0,0 +1,71 @@
+# indexing
+
+export global_size, synchronize_threads, linear_index
+
+
+# thread indexing functions
+for sym in (:x, :y, :z)
+    for f in (:blockidx, :blockdim, :threadidx, :griddim)
+        fname = Symbol(string(f, '_', sym))
+        @eval $fname(state)::Int = error("Not implemented") # COV_EXCL_LINE
+        @eval export $fname
+    end
+end
+
+"""
+    global_size(state)
+
+Global size == blockdim * griddim == total number of kernel executions
+"""
+@inline function global_size(state)
+    # TODO nd version
+    griddim_x(state) * blockdim_x(state)
+end
+
+"""
+    linear_index(state)
+
+linear index corresponding to each kernel launch (in OpenCL equal to get_global_id).
+
+"""
+@inline function linear_index(state)
+    (blockidx_x(state) - 1) * blockdim_x(state) + threadidx_x(state)
+end
+
+"""
+    linearidx(A, statesym = :state)
+
+Macro form of `linear_index`, which calls return when out of bounds.
+So it can be used like this:
+
+    ```julia
+    function kernel(state, A)
+        idx = @linearidx A state
+        # from here on it's safe to index into A with idx
+        @inbounds begin
+            A[idx] = ...
+        end
+    end
+    ```
+"""
+macro linearidx(A, statesym = :state)
+    quote
+        x1 = $(esc(A))
+        i1 = linear_index($(esc(statesym)))
+        i1 > length(x1) && return
+        i1
+    end
+end
+
+"""
+    cartesianidx(A, statesym = :state)
+
+Like [`@linearidx(A, statesym = :state)`](@ref), but returns an N-dimensional `NTuple{ndims(A), Int}` as index.
+"""
+macro cartesianidx(A, statesym = :state)
+    quote
+        x = $(esc(A))
+        i2 = @linearidx(x, $(esc(statesym)))
+        gpu_ind2sub(x, i2)
+    end
+end
@@ -1,27 +1,27 @@
-# core definition of the GPUArray type
+# core definition of the AbstractGPUArray type
 
-export GPUArray
+export AbstractGPUArray
 
-abstract type GPUArray{T, N} <: DenseArray{T, N} end
+abstract type AbstractGPUArray{T, N} <: DenseArray{T, N} end
 
 # Sampler type that acts like a texture/image and allows interpolated access
 abstract type Sampler{T, N} <: DenseArray{T, N} end
 
-const GPUVector{T} = GPUArray{T, 1}
-const GPUMatrix{T} = GPUArray{T, 2}
-const GPUVecOrMat{T} = Union{GPUArray{T, 1}, GPUArray{T, 2}}
+const GPUVector{T} = AbstractGPUArray{T, 1}
+const GPUMatrix{T} = AbstractGPUArray{T, 2}
+const GPUVecOrMat{T} = Union{AbstractGPUArray{T, 1}, AbstractGPUArray{T, 2}}
 
 # input/output
 
 ## serialization
 
 import Serialization: AbstractSerializer, serialize, deserialize, serialize_type
 
-function serialize(s::AbstractSerializer, t::T) where T <: GPUArray
+function serialize(s::AbstractSerializer, t::T) where T <: AbstractGPUArray
     serialize_type(s, T)
     serialize(s, Array(t))
 end
-function deserialize(s::AbstractSerializer, ::Type{T}) where T <: GPUArray
+function deserialize(s::AbstractSerializer, ::Type{T}) where T <: AbstractGPUArray
     A = deserialize(s)
     T(A)
 end
@@ -56,15 +56,15 @@ convert_to_cpu(xs) = adapt(Array, xs)
 for (W, ctor) in (:AT => (A,mut)->mut(A), Adapt.wrappers...)
     @eval begin
         # display
-        Base.print_array(io::IO, X::$W where {AT <: GPUArray}) =
+        Base.print_array(io::IO, X::$W where {AT <: AbstractGPUArray}) =
             Base.print_array(io, $ctor(X, convert_to_cpu))
 
         # show
-        Base._show_nonempty(io::IO, X::$W where {AT <: GPUArray}, prefix::String) =
+        Base._show_nonempty(io::IO, X::$W where {AT <: AbstractGPUArray}, prefix::String) =
             Base._show_nonempty(io, $ctor(X, convert_to_cpu), prefix)
-        Base._show_empty(io::IO, X::$W where {AT <: GPUArray}) =
+        Base._show_empty(io::IO, X::$W where {AT <: AbstractGPUArray}) =
             Base._show_empty(io, $ctor(X, convert_to_cpu))
-        Base.show_vector(io::IO, v::$W where {AT <: GPUArray}, args...) =
+        Base.show_vector(io::IO, v::$W where {AT <: AbstractGPUArray}, args...) =
             Base.show_vector(io, $ctor(v, convert_to_cpu), args...)
     end
 end
@@ -75,7 +75,7 @@ collect_to_cpu(xs::AbstractArray) = collect(convert_to_cpu(xs))
 
 for (W, ctor) in (:AT => (A,mut)->mut(A), Adapt.wrappers...)
     @eval begin
-        Base.collect(X::$W where {AT <: GPUArray}) = collect_to_cpu(X)
+        Base.collect(X::$W where {AT <: AbstractGPUArray}) = collect_to_cpu(X)
     end
 end
 
@@ -86,18 +86,18 @@ end
 
 # convert to something we can get a pointer to
 materialize(x::AbstractArray) = Array(x)
-materialize(x::GPUArray) = x
+materialize(x::AbstractGPUArray) = x
 materialize(x::Array) = x
 
-# TODO: do we want to support `copyto(..., WrappedArray{GPUArray})`
+# TODO: do we want to support `copyto(..., WrappedArray{AbstractGPUArray})`
 # if so (does not work due to lack of copy constructors):
 #for (W, ctor) in (:AT => (A,mut)->mut(A), Adapt.wrappers...)
 #    @eval begin
-#        materialize(X::$W) where {AT <: GPUArray} = AT(X)
+#        materialize(X::$W) where {AT <: AbstractGPUArray} = AT(X)
 #    end
 #end
 
-for (D, S) in ((GPUArray, AbstractArray), (Array, GPUArray), (GPUArray, GPUArray))
+for (D, S) in ((AbstractGPUArray, AbstractArray), (Array, AbstractGPUArray), (AbstractGPUArray, AbstractGPUArray))
     @eval begin
         function Base.copyto!(dest::$D{T, N}, rdest::NTuple{N, UnitRange},
                               src::$S{T, N}, ssrc::NTuple{N, UnitRange}) where {T, N}
@@ -128,7 +128,7 @@ end
 
 ## generalized blocks of heterogeneous memory
 
-Base.copyto!(dest::GPUArray, src::GPUArray) =
+Base.copyto!(dest::AbstractGPUArray, src::AbstractGPUArray) =
     copyto!(dest, CartesianIndices(dest), src, CartesianIndices(src))
 
 function copy_kernel!(state, dest, dest_offsets, src, src_offsets, shape, shape_dest, shape_source, length)
@@ -143,8 +143,8 @@ function copy_kernel!(state, dest, dest_offsets, src, src_offsets, shape, shape_
     return
 end
 
-function Base.copyto!(dest::GPUArray{T, N}, destcrange::CartesianIndices{N},
-                      src::GPUArray{U, N}, srccrange::CartesianIndices{N}) where {T, U, N}
+function Base.copyto!(dest::AbstractGPUArray{T, N}, destcrange::CartesianIndices{N},
+                      src::AbstractGPUArray{U, N}, srccrange::CartesianIndices{N}) where {T, U, N}
     shape = size(destcrange)
     if shape != size(srccrange)
         throw(DimensionMismatch("Ranges don't match their size. Found: $shape, $(size(srccrange))"))
@@ -159,7 +159,7 @@ function Base.copyto!(dest::GPUArray{T, N}, destcrange::CartesianIndices{N},
     dest
 end
 
-function Base.copyto!(dest::GPUArray{T, N}, destcrange::CartesianIndices{N},
+function Base.copyto!(dest::AbstractGPUArray{T, N}, destcrange::CartesianIndices{N},
                       src::AbstractArray{T, N}, srccrange::CartesianIndices{N}) where {T, N}
     # Is this efficient? Maybe!
     # TODO: compare to a pure intrinsic copyto implementation!
@@ -172,7 +172,7 @@ function Base.copyto!(dest::GPUArray{T, N}, destcrange::CartesianIndices{N},
 end
 
 function Base.copyto!(dest::AbstractArray{T, N}, destcrange::CartesianIndices{N},
-                      src::GPUArray{T, N}, srccrange::CartesianIndices{N}) where {T, N}
+                      src::AbstractGPUArray{T, N}, srccrange::CartesianIndices{N}) where {T, N}
     # Is this efficient? Maybe!
     dest_gpu = similar(src, size(destcrange))
     nrange = CartesianIndices(size(dest_gpu))
@@ -183,9 +183,9 @@ end
 
 ## other
 
-Base.copy(x::GPUArray) = identity.(x)
+Base.copy(x::AbstractGPUArray) = identity.(x)
 
-Base.deepcopy(x::GPUArray) = copy(x)
+Base.deepcopy(x::AbstractGPUArray) = copy(x)
 
 
 # reinterpret
@@ -221,20 +221,20 @@ This makes it easier to do checks just on the high level.
 """
 function unsafe_reinterpret end
 
-function reinterpret(::Type{T}, a::GPUArray{S,1}) where T where S
+function reinterpret(::Type{T}, a::AbstractGPUArray{S,1}) where T where S
     nel = (length(a)*sizeof(S)) ÷ sizeof(T)
     # TODO: maybe check that remainder is zero?
     return reinterpret(T, a, (nel,))
 end
 
-function reinterpret(::Type{T}, a::GPUArray{S}) where T where S
+function reinterpret(::Type{T}, a::AbstractGPUArray{S}) where T where S
     if sizeof(S) != sizeof(T)
         throw(ArgumentError("result shape not specified"))
     end
     reinterpret(T, a, size(a))
 end
 
-function reinterpret(::Type{T}, a::GPUArray{S}, dims::NTuple{N, Integer}) where T where S where N
+function reinterpret(::Type{T}, a::AbstractGPUArray{S}, dims::NTuple{N, Integer}) where T where S where N
     if !isbitstype(T)
         throw(ArgumentError("cannot reinterpret Array{$(S)} to ::Type{Array{$(T)}}, type $(T) is not a bits type"))
     end
@@ -248,13 +248,13 @@ function reinterpret(::Type{T}, a::GPUArray{S}, dims::NTuple{N, Integer}) where
     unsafe_reinterpret(T, a, dims)
 end
 
-function Base._reshape(A::GPUArray{T}, dims::Dims) where T
+function Base._reshape(A::AbstractGPUArray{T}, dims::Dims) where T
     n = length(A)
     prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims"))
     return unsafe_reinterpret(T, A, dims)
 end
 #ambig
-function Base._reshape(A::GPUArray{T, 1}, dims::Tuple{Integer}) where T
+function Base._reshape(A::AbstractGPUArray{T, 1}, dims::Tuple{Integer}) where T
     n = Base._length(A)
     prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims"))
     return unsafe_reinterpret(T, A, dims)
@@ -266,4 +266,4 @@ end
 # TODO: filter!
 
 # revert of JuliaLang/julia#31929
-Base.filter(f, As::GPUArray) = As[map(f, As)::GPUArray{Bool}]
+Base.filter(f, As::AbstractGPUArray) = As[map(f, As)::AbstractGPUArray{Bool}]
@@ -2,22 +2,22 @@
 
 allequal(x) = true
 allequal(x, y, z...) = x == y && allequal(y, z...)
-function Base.map!(f, y::GPUArray, xs::GPUArray...)
+function Base.map!(f, y::AbstractGPUArray, xs::AbstractGPUArray...)
     @assert allequal(size.((y, xs...))...)
     return y .= f.(xs...)
 end
-function Base.map(f, y::GPUArray, xs::GPUArray...)
+function Base.map(f, y::AbstractGPUArray, xs::AbstractGPUArray...)
     @assert allequal(size.((y, xs...))...)
     return f.(y, xs...)
 end
 
 # Break ambiguities with base
-Base.map!(f, y::GPUArray) =
-    invoke(map!, Tuple{Any,GPUArray,Vararg{GPUArray}}, f, y)
-Base.map!(f, y::GPUArray, x::GPUArray) =
-    invoke(map!, Tuple{Any,GPUArray, Vararg{GPUArray}}, f, y, x)
-Base.map!(f, y::GPUArray, x1::GPUArray, x2::GPUArray) =
-    invoke(map!, Tuple{Any,GPUArray, Vararg{GPUArray}}, f, y, x1, x2)
+Base.map!(f, y::AbstractGPUArray) =
+    invoke(map!, Tuple{Any,AbstractGPUArray,Vararg{AbstractGPUArray}}, f, y)
+Base.map!(f, y::AbstractGPUArray, x::AbstractGPUArray) =
+    invoke(map!, Tuple{Any,AbstractGPUArray, Vararg{AbstractGPUArray}}, f, y, x)
+Base.map!(f, y::AbstractGPUArray, x1::AbstractGPUArray, x2::AbstractGPUArray) =
+    invoke(map!, Tuple{Any,AbstractGPUArray, Vararg{AbstractGPUArray}}, f, y, x1, x2)
 
 
 # Base functions that are sadly not fit for the the GPU yet (they only work for Int64)
 
@@ -11,34 +11,34 @@ import Base.Broadcast: BroadcastStyle, Broadcasted, ArrayStyle
 # TODO: investigate if we should define out own `GPUArrayStyle{N} <: AbstractArrayStyle{N}`
 #
 # NOTE: this uses the specific `T` that was used e.g. `JLArray` or `CLArray` for ArrayStyle,
-#       instead of using `ArrayStyle{GPUArray}`, due to the fact how `similar` works.
-BroadcastStyle(::Type{T}) where {T<:GPUArray} = ArrayStyle{T}()
+#       instead of using `ArrayStyle{AbstractGPUArray}`, due to the fact how `similar` works.
+BroadcastStyle(::Type{T}) where {T<:AbstractGPUArray} = ArrayStyle{T}()
 
 # Wrapper types otherwise forget that they are GPU compatible
 #
-# NOTE: Don't directly use ArrayStyle{GPUArray} here since that would mean that `CuArrays`
+# NOTE: Don't directly use ArrayStyle{AbstractGPUArray} here since that would mean that `CuArrays`
 #       customization no longer take effect.
 for (W, ctor) in Adapt.wrappers
   @eval begin
-    BroadcastStyle(::Type{<:$W}) where {AT<:GPUArray} = BroadcastStyle(AT)
-    backend(::Type{<:$W}) where {AT<:GPUArray} = backend(AT)
+    BroadcastStyle(::Type{<:$W}) where {AT<:AbstractGPUArray} = BroadcastStyle(AT)
+    backend(::Type{<:$W}) where {AT<:AbstractGPUArray} = backend(AT)
   end
 end
 
 # This Union is a hack. Ideally Base would have a Transpose <: WrappedArray <: AbstractArray
-# and we could define our methods in terms of Union{GPUArray, WrappedArray{<:Any, <:GPUArray}}
+# and we could define our methods in terms of Union{AbstractGPUArray, WrappedArray{<:Any, <:AbstractGPUArray}}
 @eval const GPUDestArray =
-  Union{GPUArray, $((:($W where {AT <: GPUArray}) for (W, _) in Adapt.wrappers)...)}
+  Union{AbstractGPUArray, $((:($W where {AT <: AbstractGPUArray}) for (W, _) in Adapt.wrappers)...)}
 
 # We purposefully only specialize `copyto!`, dependent packages need to make sure that they
 # can handle:
 # - `bc::Broadcast.Broadcasted{Style}`
 # - `ex::Broadcast.Extruded`
-# - `LinearAlgebra.Transpose{,<:GPUArray}` and `LinearAlgebra.Adjoint{,<:GPUArray}`, etc
+# - `LinearAlgebra.Transpose{,<:AbstractGPUArray}` and `LinearAlgebra.Adjoint{,<:AbstractGPUArray}`, etc
 #    as arguments to a kernel and that they do the right conversion.
 #
 # This Broadcast can be further customize by:
-# - `Broadcast.preprocess(dest::GPUArray, bc::Broadcasted{Nothing})` which allows for a
+# - `Broadcast.preprocess(dest::AbstractGPUArray, bc::Broadcasted{Nothing})` which allows for a
 #   complete transformation based on the output type just at the end of the pipeline.
 # - `Broadcast.broadcasted(::Style, f)` selection of an implementation of `f` compatible
 #   with `Style`