JuliaGPU
diff --git a/‎docs/src/interface.md
Lines changed: 33 additions & 55 deletions b/‎docs/src/interface.md
Lines changed: 33 additions & 55 deletions
diff --git a/‎src/GPUArrays.jl
Lines changed: 8 additions & 6 deletions b/‎src/GPUArrays.jl
Lines changed: 8 additions & 6 deletions
diff --git a/‎src/device/abstractarray.jl
Lines changed: 2 additions & 28 deletions b/‎src/device/abstractarray.jl
Lines changed: 2 additions & 28 deletions
diff --git a/‎src/device/device.jl
Lines changed: 17 additions & 0 deletions b/‎src/device/device.jl
Lines changed: 17 additions & 0 deletions
diff --git a/‎src/device/execution.jl
Lines changed: 81 additions & 0 deletions b/‎src/device/execution.jl
Lines changed: 81 additions & 0 deletions
diff --git a/‎src/device/indexing.jl
Lines changed: 16 additions & 16 deletions b/‎src/device/indexing.jl
Lines changed: 16 additions & 16 deletions
diff --git a/‎src/device/memory.jl
Lines changed: 29 additions & 0 deletions b/‎src/device/memory.jl
Lines changed: 29 additions & 0 deletions
@@ -5,83 +5,61 @@ implement the interfaces listed on this page. GPUArrays is design around having
 different array types to represent a GPU array: one that only ever lives on the host, and
 one that actually can be instantiated on the device (i.e. in kernels).
 
-## Host-side
 
-Your host-side array type should build on the `AbstractGPUArray` supertype:
+## Device functionality
 
-```@docs
-AbstractGPUArray
-```
-
-First of all, you should implement operations that are expected to be defined for any
-`AbstractArray` type. Refer to the Julia manual for more details, or look at the `JLArray`
-reference implementation.
-
-To be able to actually use the functionality that is defined for `AbstractGPUArray`s, you
-should provide implementations of the following interfaces:
+Several types and interfaces are related to the device and execution of code on it. First of
+all, you need to provide a type that represents your device and exposes some properties of
+it:
 
 ```@docs
-GPUArrays.unsafe_reinterpret
-```
-
-### Devices
-
-```@docs
-GPUArrays.device
-GPUArrays.synchronize
+GPUArrays.AbstractGPUDevice
+GPUArrays.threads
 ```
 
-### Execution
+Another important set of interfaces relates to executing code on the device:
 
 ```@docs
 GPUArrays.AbstractGPUBackend
-GPUArrays.backend
-```
-
-```@docs
-GPUArrays._gpu_call
+GPUArrays.AbstractKernelContext
+GPUArrays.gpu_call
+GPUArrays.synchronize
+GPUArrays.thread_blocks_heuristic
 ```
 
-### Linear algebra
+Finally, you need to provide implementations of certain methods that will be executed on the
+device itself:
 
 ```@docs
-GPUArrays.blas_module
-GPUArrays.blasbuffer
+GPUArrays.AbstractDeviceArray
+GPUArrays.LocalMemory
+GPUArrays.synchronize_threads
+GPUArrays.blockidx
+GPUArrays.blockdim
+GPUArrays.threadidx
+GPUArrays.griddim
 ```
 
 
-## Device-side
+## Host abstractions
 
-To work with GPU memory on the device itself, e.g. within a kernel, we need a different
-type: Most functionality will behave differently when running on the GPU, e.g., accessing
-memory directly instead of copying it to the host. We should also take care not to call into
-any host library, such as the Julia runtime or the system's math library.
+You should provide an array type that builds on the `AbstractGPUArray` supertype:
 
 ```@docs
-AbstractDeviceArray
+AbstractGPUArray
 ```
 
-Your device array type should again implement the core elements of the `AbstractArray`
-interface, such as indexing and certain getters. Refer to the Julia manual for more details,
-or look at the `JLDeviceArray` reference implementation.
+First of all, you should implement operations that are expected to be defined for any
+`AbstractArray` type. Refer to the Julia manual for more details, or look at the `JLArray`
+reference implementation.
 
-You should also provide implementations of several "GPU intrinsics". To make sure the
-correct implementation is called, the first argument to these intrinsics will be the kernel
-state object from before.
+To be able to actually use the functionality that is defined for `AbstractGPUArray`s, you
+should provide implementations of the following interfaces:
 
 ```@docs
-GPUArrays.LocalMemory
-GPUArrays.synchronize_threads
-GPUArrays.blockidx_x
-GPUArrays.blockidx_y
-GPUArrays.blockidx_z
-GPUArrays.blockdim_x
-GPUArrays.blockdim_y
-GPUArrays.blockdim_z
-GPUArrays.threadidx_x
-GPUArrays.threadidx_y
-GPUArrays.threadidx_z
-GPUArrays.griddim_x
-GPUArrays.griddim_y
-GPUArrays.griddim_z
+GPUArrays.backend
+GPUArrays.device
+GPUArrays.unsafe_reinterpret
+GPUArrays.blas_module
+GPUArrays.blasbuffer
 ```
@@ -12,17 +12,19 @@ using AbstractFFTs
 
 using Adapt
 
-# device array
+# device functionality
+include("device/device.jl")
+include("device/execution.jl")
+## executed on-device
 include("device/abstractarray.jl")
 include("device/indexing.jl")
+include("device/memory.jl")
 include("device/synchronization.jl")
 
-# host array
+# host abstractions
 include("host/abstractarray.jl")
-include("host/devices.jl")
-include("host/execution.jl")
 include("host/construction.jl")
-## integrations and specialized functionality
+## integrations and specialized methods
 include("host/base.jl")
 include("host/indexing.jl")
 include("host/broadcast.jl")
@@ -32,7 +34,7 @@ include("host/random.jl")
 include("host/quirks.jl")
 
 # CPU reference implementation
-include("array.jl")
+include("reference.jl")
 
 
 end # module
@@ -1,6 +1,6 @@
-# on-device functionality
+# on-device array type
 
-export AbstractDeviceArray, @LocalMemory
+export AbstractDeviceArray
 
 
 ## device array
@@ -31,29 +31,3 @@ function Base.sum(A::AbstractDeviceArray{T}) where T
     end
     acc
 end
-
-
-## thread-local array
-
-const shmem_counter = Ref{Int}(0)
-
-"""
-Creates a local static memory shared inside one block.
-Equivalent to `__local` of OpenCL or `__shared__ (<variable>)` of CUDA.
-"""
-macro LocalMemory(state, T, N)
-    id = (shmem_counter[] += 1)
-    quote
-        LocalMemory($(esc(state)), $(esc(T)), Val($(esc(N))), Val($id))
-    end
-end
-
-"""
-Creates a block local array pointer with `T` being the element type
-and `N` the length. Both T and N need to be static! C is a counter for
-approriately get the correct Local mem id in CUDAnative.
-This is an internal method which needs to be overloaded by the GPU Array backends
-"""
-function LocalMemory(state, ::Type{T}, ::Val{dims}, ::Val{id}) where {T, dims, id}
-    error("Not implemented") # COV_EXCL_LINE
-end
@@ -0,0 +1,17 @@
+# device management and properties
+
+export AbstractGPUDevice
+
+"""
+    AbstractGPUDevice
+
+Abstract supertype for objects that represent a GPU device. Device properties are queried
+through methods that dispatch on it, such as [`threads`](@ref).
+"""
+abstract type AbstractGPUDevice end
+
+"""
+    device(A::AbstractArray)
+
+Return the device associated with the array `A`.
+
+This generic fallback throws an error stating that `A` is not a GPU array.
+"""
+function device(A::AbstractArray)
+    error("This array is not a GPU array") # COV_EXCL_LINE
+end
+
+"""
+    threads(dev::AbstractGPUDevice)
+
+Hardware threads of device `dev`.
+
+The fallback for an arbitrary `AbstractGPUDevice` throws a "Not implemented" error.
+"""
+function threads(::AbstractGPUDevice)
+    error("Not implemented") # COV_EXCL_LINE
+end
@@ -0,0 +1,81 @@
+# kernel execution
+
+export AbstractGPUBackend, AbstractKernelContext, gpu_call, synchronize, thread_blocks_heuristic
+
+"""
+    AbstractGPUBackend
+
+Abstract supertype for GPUArrays back-ends (see [`backend`](@ref)). The back-end specific
+method of [`gpu_call`](@ref) takes an `AbstractGPUBackend` as its first argument.
+"""
+abstract type AbstractGPUBackend end
+
+"""
+    AbstractKernelContext
+
+Abstract supertype for the kernel execution context, i.e. the first argument of kernels
+that are launched with [`gpu_call`](@ref).
+"""
+abstract type AbstractKernelContext end
+
+"""
+    backend(T::Type)
+    backend(x)
+
+Return the GPUArrays back-end responsible for managing arrays of type `T`. When called
+with a value `x`, the lookup is forwarded to `backend(typeof(x))`.
+
+The generic fallback throws an error stating that the object is not a GPU array.
+"""
+function backend(::Type)
+    error("This object is not a GPU array") # COV_EXCL_LINE
+end
+
+function backend(x)
+    T = typeof(x)
+    return backend(T)
+end
+
+"""
+    gpu_call(kernel::Base.Callable, arg0, args...; kwargs...)
+
+Executes `kernel` on the back-end that manages `target` (see [`backend`](@ref)), passing
+along `arg0` and any further arguments `args`. Additionally, the kernel will be passed the
+kernel execution context (see `AbstractKernelContext`), so its signature should be
+`(ctx::AbstractKernelContext, arg0, args...)`.
+
+The keyword arguments `kwargs` are not passed to the kernel, but are interpreted on the
+host to influence how the kernel is executed. The following keyword arguments are supported:
+
+- `target::AbstractArray`: specify which array object to use for determining execution
+  properties (defaults to the first argument `arg0`).
+- `total_threads::Int`: how many threads should be launched _in total_. The actual number of
+  threads and blocks is determined using a heuristic. Defaults to the length of `target` if
+  no other keyword arguments that influence the launch configuration are specified.
+- `threads::Int` and `blocks::Int`: configure exactly how many threads and blocks are
+  launched; if only one of them is given, the other one defaults to 1. This cannot be used
+  in combination with the `total_threads` argument.
+
+Any remaining keyword arguments are forwarded to the back-end specific `gpu_call` method.
+"""
+function gpu_call(kernel::Base.Callable, args...;
+                  target::AbstractArray=first(args),
+                  total_threads::Union{Int,Nothing}=nothing,
+                  threads::Union{Int,Nothing}=nothing,
+                  blocks::Union{Int,Nothing}=nothing,
+                  kwargs...)
+    # did the caller pin (part of) the launch configuration by hand?
+    explicit_config = threads !== nothing || blocks !== nothing
+
+    # nothing specified at all: launch one thread per element of the target
+    if total_threads === nothing && !explicit_config
+        total_threads = length(target)
+    end
+
+    if total_threads === nothing
+        # manual configuration; whatever was left out defaults to 1
+        threads = something(threads, 1)
+        blocks = something(blocks, 1)
+    else
+        explicit_config &&
+            error("Cannot specify both total_threads and threads/blocks configuration")
+        blocks, threads = thread_blocks_heuristic(total_threads)
+    end
+
+    gpu_call(backend(target), kernel, args...; threads=threads, blocks=blocks, kwargs...)
+end
+
+# fallback for back-ends that do not provide their own kernel launch method
+function gpu_call(backend::AbstractGPUBackend, kernel, args...; kwargs...)
+    error("Not implemented") # COV_EXCL_LINE
+end
+
+"""
+    synchronize(A::AbstractArray)
+
+Blocks until all operations on `A` are finished.
+
+The generic method is a no-op that returns `nothing`.
+"""
+# the fallback does nothing, for backends not needing synchronization; this makes it
+# easier to write generic code that also works for plain AbstractArrays
+synchronize(A::AbstractArray) = nothing
+
+"""
+    thread_blocks_heuristic(len::Integer)
+
+Pick a launch configuration for `len` work items and return it as `(blocks, threads)`.
+
+`threads` is `len` clamped to the range from 1 to 256, and `blocks` is the number of
+groups of `threads` items needed to cover `len` items (at least 1).
+"""
+function thread_blocks_heuristic(len::Integer)
+    # TODO better threads default
+    threads = clamp(len, 1, 256)
+    # exact integer ceiling division: `ceil(Int, len / threads)` rounds `len` through
+    # Float64 and can come up one block short for very large `len`
+    blocks = max(Int(cld(len, threads)), 1)
+    return (blocks, threads)
+end
@@ -5,63 +5,63 @@ export global_size, synchronize_threads, linear_index
 
 # thread indexing functions
 for f in (:blockidx, :blockdim, :threadidx, :griddim)
-    @eval $f(state)::Int = error("Not implemented") # COV_EXCL_LINE
+    @eval $f(ctx::AbstractKernelContext)::Int = error("Not implemented") # COV_EXCL_LINE
     @eval export $f
 end
 
 """
-    global_size(state)
+    global_size(ctx::AbstractKernelContext)
 
 Global size == blockdim * griddim == total number of kernel execution
 """
-@inline function global_size(state)
-    griddim(state) * blockdim(state)
+@inline function global_size(ctx::AbstractKernelContext)
+    griddim(ctx) * blockdim(ctx)
 end
 
 """
-    linear_index(state)
+    linear_index(ctx::AbstractKernelContext)
 
 linear index corresponding to each kernel launch (in OpenCL equal to get_global_id).
 
 """
-@inline function linear_index(state)
-    (blockidx(state) - 1) * blockdim(state) + threadidx(state)
+@inline function linear_index(ctx::AbstractKernelContext)
+    (blockidx(ctx) - 1) * blockdim(ctx) + threadidx(ctx)
 end
 
 """
-    linearidx(A, statesym = :state)
+    linearidx(A, ctxsym = :ctx)
 
 Macro form of `linear_index`, which calls return when out of bounds.
 So it can be used like this:
 
     ```julia
-    function kernel(state, A)
-        idx = @linear_index A state
+    function kernel(ctx::AbstractKernelContext, A)
+        idx = @linearidx A ctx
         # from here on it's save to index into A with idx
         @inbounds begin
             A[idx] = ...
         end
     end
     ```
 """
-macro linearidx(A, statesym = :state)
+macro linearidx(A, ctxsym = :ctx)
     quote
         x1 = $(esc(A))
-        i1 = linear_index($(esc(statesym)))
+        i1 = linear_index($(esc(ctxsym)))
         i1 > length(x1) && return
         i1
     end
 end
 
 """
-    cartesianidx(A, statesym = :state)
+    cartesianidx(A, ctxsym = :ctx)
 
-Like [`@linearidx(A, statesym = :state)`](@ref), but returns an N-dimensional `NTuple{ndim(A), Int}` as index
+Like [`@linearidx(A, ctxsym = :ctx)`](@ref), but returns an N-dimensional `NTuple{ndims(A), Int}` as index
 """
-macro cartesianidx(A, statesym = :state)
+macro cartesianidx(A, ctxsym = :ctx)
     quote
         x = $(esc(A))
-        i2 = @linearidx(x, $(esc(statesym)))
+        i2 = @linearidx(x, $(esc(ctxsym)))
         gpu_ind2sub(x, i2)
     end
 end
@@ -0,0 +1,29 @@
+# on-device memory management
+
+export @LocalMemory
+
+
+## thread-local array
+
+const shmem_counter = Ref{Int}(0)
+
+"""
+    @LocalMemory(ctx, T, N)
+
+Creates a local static memory shared inside one block.
+Equivalent to `__local` of OpenCL or `__shared__ (<variable>)` of CUDA.
+
+Expands to a `LocalMemory(ctx, T, Val(N), Val(id))` call, where `id` comes from a counter
+that is incremented every time the macro is expanded.
+"""
+macro LocalMemory(ctx, T, N)
+    # every expansion gets the next value of the counter
+    shmem_counter[] += 1
+    id = shmem_counter[]
+    return :(LocalMemory($(esc(ctx)), $(esc(T)), Val($(esc(N))), Val($id)))
+end
+
+"""
+    LocalMemory(ctx, T, Val(N), Val(id))
+
+Creates a block-local array pointer with `T` being the element type and `N` the length.
+Both `T` and `N` need to be static! `id` is a counter that is used to get the correct
+local memory id in CUDAnative.
+
+This is an internal method which needs to be overloaded by the GPU array backends; the
+fallback throws a "Not implemented" error.
+"""
+LocalMemory(ctx, ::Type{T}, ::Val{N}, ::Val{C}) where {T, N, C} =
+    error("Not implemented") # COV_EXCL_LINE