Skip to content

Commit ff71015

Browse files
committed
use cartesian iteration to support blocking
1 parent 550a693 commit ff71015

File tree

7 files changed

+142
-206
lines changed

7 files changed

+142
-206
lines changed

src/KernelAbstractions.jl

Lines changed: 28 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,9 @@ function async_copy! end
7676
"""
7777
groupsize()
7878
79-
Query the workgroupsize on the device.
79+
Query the workgroupsize on the device. This function returns
80+
a tuple corresponding to kernel configuration. In order to get
81+
the total size you can use `prod(groupsize())`.
8082
"""
8183
function groupsize end
8284

@@ -131,10 +133,6 @@ macro index(locale, args...)
131133
indexkind = :Linear
132134
end
133135

134-
if indexkind === :Cartesian && locale === :Local
135-
error("@index(Local, Cartesian) is not implemented yet")
136-
end
137-
138136
index_function = Symbol(:__index_, locale, :_, indexkind)
139137
Expr(:call, GlobalRef(KernelAbstractions, index_function), map(esc, args)...)
140138
end
@@ -189,14 +187,7 @@ end
189187
workgroupsize(::Kernel{D, WorkgroupSize}) where {D, WorkgroupSize} = WorkgroupSize
190188
ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize,NDRange} = NDRange
191189

192-
"""
193-
partition(kernel, ndrange)
194-
195-
Splits the maximum size of the iteration space by the workgroupsize.
196-
Returns the number of workgroups necessary and whether the last workgroup
197-
needs to perform dynamic bounds-checking.
198-
"""
199-
@inline function partition(kernel::Kernel, ndrange, workgroupsize)
190+
function partition(kernel, ndrange, workgroupsize)
200191
static_ndrange = KernelAbstractions.ndrange(kernel)
201192
static_workgroupsize = KernelAbstractions.workgroupsize(kernel)
202193

@@ -208,42 +199,49 @@ needs to perform dynamic bounds-checking.
208199
You created a dynamically sized kernel, but forgot to provide runtime
209200
parameters for the kernel. Either provide them statically if known
210201
or dynamically.
211-
NDRange(Static): $(typeof(static_ndrange))
202+
NDRange(Static): $(static_ndrange)
212203
NDRange(Dynamic): $(ndrange)
213-
Workgroupsize(Static): $(typeof(static_workgroupsize))
204+
Workgroupsize(Static): $(static_workgroupsize)
214205
Workgroupsize(Dynamic): $(workgroupsize)
215206
"""
216207
error(errmsg)
217208
end
218209

219-
if ndrange !== nothing && static_ndrange <: StaticSize
220-
if prod(ndrange) != prod(get(static_ndrange))
210+
if static_ndrange <: StaticSize
211+
if ndrange !== nothing && ndrange != get(static_ndrange)
221212
error("Static NDRange and launch NDRange differ")
222213
end
214+
ndrange = get(static_ndrange)
223215
end
224216

225217
if static_workgroupsize <: StaticSize
226-
@assert length(get(static_workgroupsize)) === 1
227-
static_workgroupsize = get(static_workgroupsize)[1]
228-
if workgroupsize !== nothing && workgroupsize != static_workgroupsize
218+
if workgroupsize !== nothing && workgroupsize != get(static_workgroupsize)
229219
error("Static WorkgroupSize and launch WorkgroupSize differ")
230220
end
231-
workgroupsize = static_workgroupsize
221+
workgroupsize = get(static_workgroupsize)
232222
end
223+
233224
@assert workgroupsize !== nothing
225+
@assert ndrange !== nothing
226+
blocks, workgroupsize, dynamic = NDIteration.partition(ndrange, workgroupsize)
234227

235228
if static_ndrange <: StaticSize
236-
maxsize = prod(get(static_ndrange))
237-
else
238-
maxsize = prod(ndrange)
229+
static_blocks = StaticSize{blocks}
230+
blocks = nothing
231+
else
232+
static_blocks = DynamicSize
233+
blocks = CartesianIndices(blocks)
239234
end
240235

241-
nworkgroups = fld1(maxsize, workgroupsize)
242-
dynamic = mod(maxsize, workgroupsize) != 0
243-
244-
dynamic || @assert(nworkgroups * workgroupsize == maxsize)
236+
if static_workgroupsize <: StaticSize
237+
static_workgroupsize = StaticSize{workgroupsize} # we might have padded workgroupsize
238+
workgroupsize = nothing
239+
else
240+
workgroupsize = CartesianIndices(workgroupsize)
241+
end
245242

246-
return nworkgroups, dynamic
243+
iterspace = NDRange{length(ndrange), static_blocks, static_workgroupsize}(blocks, workgroupsize)
244+
return iterspace, dynamic
247245
end
248246

249247
###
@@ -256,10 +254,7 @@ include("compiler.jl")
256254
# Compiler/Frontend
257255
###
258256

259-
@inline function __workitems_iterspace()
260-
return 1:groupsize()
261-
end
262-
257+
function __workitems_iterspace end
263258
function __validindex end
264259

265260
include("macros.jl")

src/backends/cpu.jl

Lines changed: 46 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -14,86 +14,90 @@ function wait(ev::CPUEvent, progress=nothing)
1414
end
1515

1616
function (obj::Kernel{CPU})(args...; ndrange=nothing, workgroupsize=nothing, dependencies=nothing)
17-
if ndrange isa Int
17+
if ndrange isa Integer
1818
ndrange = (ndrange,)
1919
end
20+
if workgroupsize isa Integer
21+
workgroupsize = (workgroupsize, )
22+
end
2023
if dependencies isa Event
2124
dependencies = (dependencies,)
2225
end
26+
2327
if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing
24-
workgroupsize = 1024 # Vectorization, 4x unrolling, minimal grain size
28+
workgroupsize = (1024,) # Vectorization, 4x unrolling, minimal grain size
2529
end
26-
nblocks, dynamic = partition(obj, ndrange, workgroupsize)
30+
iterspace, dynamic = partition(obj, ndrange, workgroupsize)
2731
# partition checked that the ndrange's agreed
2832
if KernelAbstractions.ndrange(obj) <: StaticSize
2933
ndrange = nothing
3034
end
31-
if KernelAbstractions.workgroupsize(obj) <: StaticSize
32-
workgroupsize = nothing
33-
end
34-
t = Threads.@spawn begin
35+
36+
t = __run(obj, ndrange, iterspace, args, dependencies)
37+
return CPUEvent(t)
38+
end
39+
40+
# Inference barrier
41+
function __run(obj, ndrange, iterspace, args, dependencies)
42+
return Threads.@spawn begin
3543
if dependencies !== nothing
3644
Base.sync_end(map(e->e.task, dependencies))
3745
end
3846
@sync begin
39-
for I in 1:(nblocks-1)
40-
let ctx = mkcontext(obj, I, ndrange, workgroupsize)
41-
Threads.@spawn Cassette.overdub(ctx, obj.f, args...)
47+
# TODO: how do we use the information that the iteration space maps perfectly to
48+
# the ndrange without incurring a 2x compilation overhead
49+
# if dynamic
50+
for block in iterspace
51+
let ctx = mkcontextdynamic(obj, block, ndrange, iterspace)
52+
Threads.@spawn Cassette.overdub(ctx, obj.f, args...)
53+
end
4254
end
43-
end
44-
45-
if dynamic
46-
let ctx = mkcontextdynamic(obj, nblocks, ndrange, workgroupsize)
47-
Threads.@spawn Cassette.overdub(ctx, obj.f, args...)
48-
end
49-
else
50-
let ctx = mkcontext(obj, nblocks, ndrange, workgroupsize)
51-
Threads.@spawn Cassette.overdub(ctx, obj.f, args...)
52-
end
53-
end
55+
# else
56+
# for block in iterspace
57+
# let ctx = mkcontext(obj, blocks, ndrange, iterspace)
58+
# Threads.@spawn Cassette.overdub(ctx, obj.f, args...)
59+
# end
60+
# end
61+
# end
5462
end
5563
end
56-
return CPUEvent(t)
5764
end
5865

5966
Cassette.@context CPUCtx
6067

61-
function mkcontext(kernel::Kernel{CPU}, I, _ndrange, _workgroupsize)
62-
metadata = CompilerMetadata{workgroupsize(kernel), ndrange(kernel), false}(I, _ndrange, _workgroupsize)
68+
function mkcontext(kernel::Kernel{CPU}, I, _ndrange, iterspace)
69+
metadata = CompilerMetadata{ndrange(kernel), false}(I, _ndrange, iterspace)
6370
Cassette.disablehooks(CPUCtx(pass = CompilerPass, metadata=metadata))
6471
end
6572

66-
function mkcontextdynamic(kernel::Kernel{CPU}, I, _ndrange, _workgroupsize)
67-
metadata = CompilerMetadata{workgroupsize(kernel), ndrange(kernel), true}(I, _ndrange, _workgroupsize)
73+
function mkcontextdynamic(kernel::Kernel{CPU}, I, _ndrange, iterspace)
74+
metadata = CompilerMetadata{ndrange(kernel), true}(I, _ndrange, iterspace)
6875
Cassette.disablehooks(CPUCtx(pass = CompilerPass, metadata=metadata))
6976
end
7077

71-
@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Local_Linear), idx)
72-
return idx
78+
@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Local_Linear), idx::CartesianIndex)
79+
indices = workitems(__iterspace(ctx.metadata))
80+
return @inbounds LinearIndices(indices)[idx]
7381
end
7482

75-
@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Global_Linear), idx)
76-
workgroup = __groupindex(ctx.metadata)
77-
(workgroup - 1) * __groupsize(ctx.metadata) + idx
83+
@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Global_Linear), idx::CartesianIndex)
84+
I = @inbounds expand(__iterspace(ctx.metadata), __groupindex(ctx.metadata), idx)
85+
@inbounds LinearIndices(__ndrange(ctx.metadata))[I]
7886
end
7987

80-
@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Local_Cartesian), idx)
81-
error("@index(Local, Cartesian) is not yet defined")
88+
@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Local_Cartesian), idx::CartesianIndex)
89+
return idx
8290
end
8391

84-
@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Global_Cartesian), idx)
85-
workgroup = __groupindex(ctx.metadata)
86-
indices = __ndrange(ctx.metadata)
87-
lI = (workgroup - 1) * __groupsize(ctx.metadata) + idx
88-
return @inbounds indices[lI]
92+
@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__index_Global_Cartesian), idx::CartesianIndex)
93+
return @inbounds expand(__iterspace(ctx.metadata), __groupindex(ctx.metadata), idx)
8994
end
9095

91-
@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__validindex), idx)
96+
@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(__validindex), idx::CartesianIndex)
9297
# Turns this into a no-op for code where we can turn off checkbounds
9398
if __dynamic_checkbounds(ctx.metadata)
94-
maxidx = prod(size(__ndrange(ctx.metadata)))
95-
valid = idx <= mod1(maxidx, __groupsize(ctx.metadata))
96-
return valid
99+
I = @inbounds expand(__iterspace(ctx.metadata), __groupindex(ctx.metadata), idx)
100+
return I in __ndrange(ctx.metadata)
97101
else
98102
return true
99103
end

src/backends/cuda.jl

Lines changed: 34 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,12 @@ function wait(ev::CudaEvent, progress=nothing)
5959
end
6060

6161
function (obj::Kernel{CUDA})(args...; ndrange=nothing, dependencies=nothing, workgroupsize=nothing)
62-
if ndrange isa Int
62+
if ndrange isa Integer
6363
ndrange = (ndrange,)
6464
end
65+
if workgroupsize isa Integer
66+
workgroupsize = (workgroupsize, )
67+
end
6568
if dependencies isa Event
6669
dependencies = (dependencies,)
6770
end
@@ -74,92 +77,63 @@ function (obj::Kernel{CUDA})(args...; ndrange=nothing, dependencies=nothing, wor
7477
end
7578
end
7679

77-
event = CuEvent(CUDAdrv.EVENT_DISABLE_TIMING)
78-
79-
# Launch kernel
80-
ctx = mkcontext(obj, ndrange)
81-
args = (ctx, obj.f, args...)
82-
GC.@preserve args begin
83-
kernel_args = map(CUDAnative.cudaconvert, args)
84-
kernel_tt = Tuple{map(Core.Typeof, kernel_args)...}
85-
86-
# If the kernel is statically sized we can tell the compiler about that
87-
if KernelAbstractions.workgroupsize(obj) <: StaticSize
88-
static_workgroupsize = get(KernelAbstractions.workgroupsize(obj))[1]
89-
else
90-
static_workgroupsize = nothing
91-
end
80+
if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing
81+
# TODO: allow for NDRange{1, DynamicSize, DynamicSize}(nothing, nothing)
82+
# and actually use CUDAnative autotuning
83+
workgroupsize = (256,)
84+
end
85+
# If the kernel is statically sized we can tell the compiler about that
86+
if KernelAbstractions.workgroupsize(obj) <: StaticSize
87+
maxthreads = prod(get(KernelAbstractions.workgroupsize(obj)))
88+
else
89+
maxthreads = nothing
90+
end
9291

93-
kernel = CUDAnative.cufunction(Cassette.overdub, kernel_tt; name=String(nameof(obj.f)), maxthreads=static_workgroupsize)
92+
iterspace, dynamic = partition(obj, ndrange, workgroupsize)
9493

95-
# Dynamically sized and size not prescribed, use autotuning
96-
if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing
97-
workgroupsize = CUDAnative.maxthreads(kernel)
98-
end
94+
nblocks = length(blocks(iterspace))
95+
threads = length(workitems(iterspace))
9996

100-
if workgroupsize === nothing
101-
threads = static_workgroupsize
102-
else
103-
threads = workgroupsize
104-
end
105-
@assert threads !== nothing
97+
ctx = mkcontext(obj, ndrange, iterspace)
98+
# Launch kernel
99+
event = CuEvent(CUDAdrv.EVENT_DISABLE_TIMING)
100+
CUDAnative.@cuda(threads=threads, blocks=nblocks, stream=stream,
101+
name=String(nameof(obj.f)), maxthreads=maxthreads,
102+
Cassette.overdub(ctx, obj.f, args...))
106103

107-
blocks, _ = partition(obj, ndrange, threads)
108-
kernel(kernel_args..., threads=threads, blocks=blocks, stream=stream)
109-
end
110104
CUDAdrv.record(event, stream)
111105
return CudaEvent(event)
112106
end
113107

114108
Cassette.@context CUDACtx
115109

116-
function mkcontext(kernel::Kernel{CUDA}, _ndrange)
117-
metadata = CompilerMetadata{workgroupsize(kernel), ndrange(kernel), true}(_ndrange)
110+
function mkcontext(kernel::Kernel{CUDA}, _ndrange, iterspace)
111+
metadata = CompilerMetadata{ndrange(kernel), true}(_ndrange, iterspace)
118112
Cassette.disablehooks(CUDACtx(pass = CompilerPass, metadata=metadata))
119113
end
120114

121-
122-
@inline function __gpu_groupsize(::CompilerMetadata{WorkgroupSize}) where {WorkgroupSize<:DynamicSize}
123-
CUDAnative.blockDim().x
124-
end
125-
126-
@inline function __gpu_groupsize(cm::CompilerMetadata{WorkgroupSize}) where {WorkgroupSize<:StaticSize}
127-
__groupsize(cm)
128-
end
129-
130115
@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__index_Local_Linear))
131-
idx = CUDAnative.threadIdx().x
132-
return idx
116+
return CUDAnative.threadIdx().x
133117
end
134118

135119
@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__index_Global_Linear))
136-
idx = CUDAnative.threadIdx().x
137-
workgroup = CUDAnative.blockIdx().x
138-
# XXX: have a verify mode where we check that our static dimensions are right
139-
# e.g. that blockDim().x === __groupsize(ctx.metadata)
140-
return (workgroup - 1) * __gpu_groupsize(ctx.metadata) + idx
120+
I = @inbounds expand(__iterspace(ctx.metadata), CUDAnative.blockIdx().x, CUDAnative.threadIdx().x)
121+
# TODO: This is unfortunate, can we get the linear index cheaper
122+
@inbounds LinearIndices(__ndrange(ctx.metadata))[I]
141123
end
142124

143125
@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__index_Local_Cartesian))
144-
error("@index(Local, Cartesian) is not yet defined")
126+
@inbounds workitems(__iterspace(ctx.metadata))[CUDAnative.threadIdx().x]
145127
end
146128

147129
@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__index_Global_Cartesian))
148-
idx = CUDAnative.threadIdx().x
149-
workgroup = CUDAnative.blockIdx().x
150-
lI = (workgroup - 1) * __gpu_groupsize(ctx.metadata) + idx
151-
152-
indices = __ndrange(ctx.metadata)
153-
return @inbounds indices[lI]
130+
return @inbounds expand(__iterspace(ctx.metadata), CUDAnative.blockIdx().x, CUDAnative.threadIdx().x)
154131
end
155132

156133
@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__validindex))
157134
if __dynamic_checkbounds(ctx.metadata)
158-
idx = CUDAnative.threadIdx().x
159-
workgroup = CUDAnative.blockIdx().x
160-
lI = (workgroup - 1) * __gpu_groupsize(ctx.metadata) + idx
161-
maxidx = prod(size(__ndrange(ctx.metadata)))
162-
return lI <= maxidx
135+
I = @inbounds expand(__iterspace(ctx.metadata), CUDAnative.blockIdx().x, CUDAnative.threadIdx().x)
136+
return I in __ndrange(ctx.metadata)
163137
else
164138
return true
165139
end

0 commit comments

Comments
 (0)