
Commit 6da1def: introduce __ctx__ argument to kernel functions

1 parent: eb1e356
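In brief, as read from the diff below: each backend's Cassette context is hoisted into a shared constant (CUDACTX, ROCCTX) so it no longer carries per-launch state, and the per-launch CompilerMetadata instead travels as an explicit hidden argument, __ctx__, threaded through every kernel function and device-side intrinsic. The one user-visible consequence is that groupsize() becomes the macro @groupsize(), which expands to a call that reads __ctx__.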

13 files changed: +127 -111 lines

examples/performance.jl
Lines changed: 8 additions & 8 deletions

@@ -30,8 +30,8 @@ end
     I, J = @index(Global, NTuple)
     i, j = @index(Local, NTuple)

-    N = @uniform groupsize()[1]
-    M = @uniform groupsize()[2]
+    N = @uniform @groupsize()[1]
+    M = @uniform @groupsize()[2]

     # +1 to avoid bank conflicts on shared memory
     tile = @localmem eltype(output) (N+BANK, M)
@@ -48,8 +48,8 @@ end
     gi, gj = @index(Group, NTuple)
     i, j = @index(Local, NTuple)

-    N = @uniform groupsize()[1]
-    M = @uniform groupsize()[2]
+    N = @uniform @groupsize()[1]
+    M = @uniform @groupsize()[2]

     # +1 to avoid bank conflicts on shared memory
     tile = @localmem eltype(output) (N+BANK, M)
@@ -77,8 +77,8 @@ end
     gi, gj = @index(Group, NTuple)
     i, j = @index(Local, NTuple)

-    TILE_DIM = @uniform groupsize()[1]
-    BLOCK_ROWS = @uniform groupsize()[2]
+    TILE_DIM = @uniform @groupsize()[1]
+    BLOCK_ROWS = @uniform @groupsize()[2]

     # +1 to avoid bank conflicts on shared memory
     tile = @localmem eltype(output) (TILE_DIM+BANK, TILE_DIM)
@@ -103,8 +103,8 @@ end
     gi, gj = @index(Group, NTuple)
     i, j = @index(Local, NTuple)

-    TILE_DIM = @uniform groupsize()[1]
-    BLOCK_ROWS = @uniform groupsize()[2]
+    TILE_DIM = @uniform @groupsize()[1]
+    BLOCK_ROWS = @uniform @groupsize()[2]

     # +1 to avoid bank conflicts on shared memory
     tile = @localmem eltype(output) (TILE_DIM+BANK, TILE_DIM)
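The examples change only in spelling: groupsize() becomes @groupsize(). A minimal kernel sketch against the new spelling (the kernel and the launch lines are illustrative, not from this commit; the launch uses this version's event-based API):

using KernelAbstractions

@kernel function copy_kernel!(dst, @Const(src))
    I = @index(Global, Linear)
    # @groupsize() expands to groupsize(__ctx__), reading the workgroup
    # size from the hidden context argument this commit introduces.
    N = @uniform prod(@groupsize())
    @inbounds dst[I] = src[I]
end

# dst, src = zeros(64), rand(64)
# kernel = copy_kernel!(CPU(), 16)             # backend + workgroup size
# wait(kernel(dst, src, ndrange=length(dst)))  # launch returns an Event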

examples/performant_matmul.jl
Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ const TILE_DIM = 32
     gi, gj = @index(Group, NTuple)
     i, j = @index(Local, NTuple)

-    TILE_DIM = @uniform groupsize()[1]
+    TILE_DIM = @uniform @groupsize()[1]

     # +1 to avoid bank conflicts on shared memory
     tile1 = @localmem eltype(output) (TILE_DIM+BANK, TILE_DIM)

lib/CUDAKernels/src/CUDAKernels.jl
Lines changed: 25 additions & 23 deletions

@@ -191,7 +191,7 @@ function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(
     ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize)
     # this might not be the final context, since we may tune the workgroupsize
     ctx = mkcontext(obj, ndrange, iterspace)
-    kernel = CUDA.@cuda launch=false name=String(nameof(obj.f)) Cassette.overdub(ctx, obj.f, args...)
+    kernel = CUDA.@cuda launch=false name=String(nameof(obj.f)) Cassette.overdub(CUDACTX, obj.f, ctx, args...)

     # figure out the optimal workgroupsize automatically
     if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing
@@ -220,7 +220,7 @@ function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(

     # Launch kernel
     event = CUDA.CuEvent(CUDA.EVENT_DISABLE_TIMING)
-    kernel(ctx, obj.f, args...; threads=threads, blocks=nblocks, stream=stream)
+    kernel(CUDACTX, obj.f, ctx, args...; threads=threads, blocks=nblocks, stream=stream)

     CUDA.record(event, stream)
     return CudaEvent(event)
@@ -232,41 +232,43 @@ import KernelAbstractions: CompilerMetadata, CompilerPass, DynamicCheck, LinearI
 import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
 import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds

+const CUDACTX = Cassette.disablehooks(CUDACtx(pass = CompilerPass))
+KernelAbstractions.cassette(::Kernel{CUDADevice}) = CUDACTX
+
 function mkcontext(kernel::Kernel{CUDADevice}, _ndrange, iterspace)
-    metadata = CompilerMetadata{KernelAbstractions.ndrange(kernel), DynamicCheck}(_ndrange, iterspace)
-    Cassette.disablehooks(CUDACtx(pass = CompilerPass, metadata=metadata))
+    CompilerMetadata{KernelAbstractions.ndrange(kernel), DynamicCheck}(_ndrange, iterspace)
 end

-@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__index_Local_Linear))
+@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Local_Linear), ctx)
     return CUDA.threadIdx().x
 end

-@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__index_Group_Linear))
+@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Group_Linear), ctx)
     return CUDA.blockIdx().x
 end

-@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__index_Global_Linear))
-    I = @inbounds expand(__iterspace(ctx.metadata), CUDA.blockIdx().x, CUDA.threadIdx().x)
+@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Global_Linear), ctx)
+    I = @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x)
     # TODO: This is unfortunate, can we get the linear index cheaper
-    @inbounds LinearIndices(__ndrange(ctx.metadata))[I]
+    @inbounds LinearIndices(__ndrange(ctx))[I]
 end

-@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__index_Local_Cartesian))
-    @inbounds workitems(__iterspace(ctx.metadata))[CUDA.threadIdx().x]
+@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Local_Cartesian), ctx)
+    @inbounds workitems(__iterspace(ctx))[CUDA.threadIdx().x]
 end

-@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__index_Group_Cartesian))
-    @inbounds blocks(__iterspace(ctx.metadata))[CUDA.blockIdx().x]
+@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Group_Cartesian), ctx)
+    @inbounds blocks(__iterspace(ctx))[CUDA.blockIdx().x]
 end

-@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__index_Global_Cartesian))
-    return @inbounds expand(__iterspace(ctx.metadata), CUDA.blockIdx().x, CUDA.threadIdx().x)
+@inline function Cassette.overdub(::CUDACtx, ::typeof(__index_Global_Cartesian), ctx)
+    return @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x)
 end

-@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__validindex))
-    if __dynamic_checkbounds(ctx.metadata)
-        I = @inbounds expand(__iterspace(ctx.metadata), CUDA.blockIdx().x, CUDA.threadIdx().x)
-        return I in __ndrange(ctx.metadata)
+@inline function Cassette.overdub(::CUDACtx, ::typeof(__validindex), ctx)
+    if __dynamic_checkbounds(ctx)
+        I = @inbounds expand(__iterspace(ctx), CUDA.blockIdx().x, CUDA.threadIdx().x)
+        return I in __ndrange(ctx)
     else
         return true
     end
@@ -323,7 +325,7 @@ import KernelAbstractions: ConstAdaptor, SharedMemory, Scratchpad, __synchronize
 # GPU implementation of shared memory
 ###

-@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(SharedMemory), ::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+@inline function Cassette.overdub(::CUDACtx, ::typeof(SharedMemory), ::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
     ptr = emit_shmem(Val(Id), T, Val(prod(Dims)))
     CUDA.CuDeviceArray(Dims, ptr)
 end
@@ -333,15 +335,15 @@ end
 # - private memory for each workitem
 ###

-@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(Scratchpad), ::Type{T}, ::Val{Dims}) where {T, Dims}
+@inline function Cassette.overdub(::CUDACtx, ::typeof(Scratchpad), ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
     MArray{__size(Dims), T}(undef)
 end

-@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__synchronize))
+@inline function Cassette.overdub(::CUDACtx, ::typeof(__synchronize))
     CUDA.sync_threads()
 end

-@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(__print), args...)
+@inline function Cassette.overdub(::CUDACtx, ::typeof(__print), args...)
     CUDA._cuprint(args...)
 end
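Reduced to a toy, the new calling convention looks like this: overdub receives a shared, stateless context plus the metadata as an ordinary argument, so a single context instance serves every launch instead of a fresh context being built around each CompilerMetadata. A self-contained sketch runnable on the CPU (ToyCtx, my_index, toy_kernel, and the NamedTuple metadata are hypothetical stand-ins; the Cassette calls themselves are real):

using Cassette

Cassette.@context ToyCtx

# Shared, stateless context, analogous to CUDACTX / ROCCTX.
const TOYCTX = Cassette.disablehooks(ToyCtx())

my_index(ctx) = error("only meaningful under the context")

# Dispatch on the overdubbed *function*; the per-launch metadata now
# arrives as a plain argument instead of via ctx.metadata.
@inline Cassette.overdub(::ToyCtx, ::typeof(my_index), ctx) = ctx.offset + 1

toy_kernel(ctx, x) = x + my_index(ctx)

metadata = (offset = 41,)  # stand-in for CompilerMetadata
@assert Cassette.overdub(TOYCTX, toy_kernel, metadata, 1) == 43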

lib/ROCKernels/src/ROCKernels.jl
Lines changed: 23 additions & 22 deletions

@@ -204,7 +204,7 @@ function (obj::Kernel{ROCDevice})(args...; ndrange=nothing, dependencies=nothing
     # Launch kernel
     event = AMDGPU.@roc(groupsize=threads, gridsize=nblocks*threads, queue=queue,
                         name=String(nameof(obj.f)), # TODO: maxthreads=maxthreads,
-                        Cassette.overdub(ctx, obj.f, args...))
+                        Cassette.overdub(ROCCTX, obj.f, ctx, args...))

     return ROCEvent(event.event)
 end
@@ -215,45 +215,46 @@ import KernelAbstractions: CompilerMetadata, CompilerPass, DynamicCheck, LinearI
 import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
 import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds

+const ROCCTX = Cassette.disablehooks(ROCCtx(pass = CompilerPass))
+KernelAbstractions.cassette(::Kernel{ROCDevice}) = ROCCTX
+
 function mkcontext(kernel::Kernel{ROCDevice}, _ndrange, iterspace)
     metadata = CompilerMetadata{KernelAbstractions.ndrange(kernel), DynamicCheck}(_ndrange, iterspace)
-    Cassette.disablehooks(ROCCtx(pass = CompilerPass, metadata=metadata))
 end
 function mkcontext(kernel::Kernel{ROCDevice}, I, _ndrange, iterspace, ::Dynamic) where Dynamic
     metadata = CompilerMetadata{KernelAbstractions.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
-    Cassette.disablehooks(ROCCtx(pass = CompilerPass, metadata=metadata))
 end

-@inline function Cassette.overdub(ctx::ROCCtx, ::typeof(__index_Local_Linear))
+@inline function Cassette.overdub(::ROCCtx, ::typeof(__index_Local_Linear), ctx)
     return AMDGPU.threadIdx().x
 end

-@inline function Cassette.overdub(ctx::ROCCtx, ::typeof(__index_Group_Linear))
+@inline function Cassette.overdub(::ROCCtx, ::typeof(__index_Group_Linear), ctx)
     return AMDGPU.blockIdx().x
 end

-@inline function Cassette.overdub(ctx::ROCCtx, ::typeof(__index_Global_Linear))
-    I = @inbounds expand(__iterspace(ctx.metadata), AMDGPU.blockIdx().x, AMDGPU.threadIdx().x)
+@inline function Cassette.overdub(::ROCCtx, ::typeof(__index_Global_Linear), ctx)
+    I = @inbounds expand(__iterspace(ctx), AMDGPU.blockIdx().x, AMDGPU.threadIdx().x)
     # TODO: This is unfortunate, can we get the linear index cheaper
-    @inbounds LinearIndices(__ndrange(ctx.metadata))[I]
+    @inbounds LinearIndices(__ndrange(ctx))[I]
 end

-@inline function Cassette.overdub(ctx::ROCCtx, ::typeof(__index_Local_Cartesian))
-    @inbounds workitems(__iterspace(ctx.metadata))[AMDGPU.threadIdx().x]
+@inline function Cassette.overdub(::ROCCtx, ::typeof(__index_Local_Cartesian), ctx)
+    @inbounds workitems(__iterspace(ctx))[AMDGPU.threadIdx().x]
 end

-@inline function Cassette.overdub(ctx::ROCCtx, ::typeof(__index_Group_Cartesian))
-    @inbounds blocks(__iterspace(ctx.metadata))[AMDGPU.blockIdx().x]
+@inline function Cassette.overdub(::ROCCtx, ::typeof(__index_Group_Cartesian), ctx)
+    @inbounds blocks(__iterspace(ctx))[AMDGPU.blockIdx().x]
 end

-@inline function Cassette.overdub(ctx::ROCCtx, ::typeof(__index_Global_Cartesian))
-    return @inbounds expand(__iterspace(ctx.metadata), AMDGPU.blockIdx().x, AMDGPU.threadIdx().x)
+@inline function Cassette.overdub(::ROCCtx, ::typeof(__index_Global_Cartesian), ctx)
+    return @inbounds expand(__iterspace(ctx), AMDGPU.blockIdx().x, AMDGPU.threadIdx().x)
 end

-@inline function Cassette.overdub(ctx::ROCCtx, ::typeof(__validindex))
-    if __dynamic_checkbounds(ctx.metadata)
-        I = @inbounds expand(__iterspace(ctx.metadata), AMDGPU.blockIdx().x, AMDGPU.threadIdx().x)
-        return I in __ndrange(ctx.metadata)
+@inline function Cassette.overdub(::ROCCtx, ::typeof(__validindex), ctx)
+    if __dynamic_checkbounds(ctx)
+        I = @inbounds expand(__iterspace(ctx), AMDGPU.blockIdx().x, AMDGPU.threadIdx().x)
+        return I in __ndrange(ctx)
     else
         return true
     end
@@ -305,7 +306,7 @@ import KernelAbstractions: ConstAdaptor, SharedMemory, Scratchpad, __synchronize
 ###
 # GPU implementation of shared memory
 ###
-@inline function Cassette.overdub(ctx::ROCCtx, ::typeof(SharedMemory), ::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+@inline function Cassette.overdub(::ROCCtx, ::typeof(SharedMemory), ::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
     ptr = AMDGPU.alloc_special(Val(Id), T, Val(AMDGPU.AS.Local), Val(prod(Dims)))
     AMDGPU.ROCDeviceArray(Dims, ptr)
 end
@@ -315,15 +316,15 @@ end
 # - private memory for each workitem
 ###

-@inline function Cassette.overdub(ctx::ROCCtx, ::typeof(Scratchpad), ::Type{T}, ::Val{Dims}) where {T, Dims}
+@inline function Cassette.overdub(::ROCCtx, ::typeof(Scratchpad), ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
     MArray{__size(Dims), T}(undef)
 end

-@inline function Cassette.overdub(ctx::ROCCtx, ::typeof(__synchronize))
+@inline function Cassette.overdub(::ROCCtx, ::typeof(__synchronize))
     AMDGPU.sync_workgroup()
 end

-@inline function Cassette.overdub(ctx::ROCCtx, ::typeof(__print), args...)
+@inline function Cassette.overdub(::ROCCtx, ::typeof(__print), args...)
     for arg in args
         AMDGPU.@rocprintf("%s", arg)
     end
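The ROCm backend mirrors the CUDA changes one-for-one: a shared ROCCTX constant, mkcontext methods that now return the bare CompilerMetadata, and device-side overdub methods that take the metadata as a trailing ctx argument rather than reading ctx.metadata.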

src/KernelAbstractions.jl
Lines changed: 15 additions & 9 deletions

@@ -1,7 +1,7 @@
 module KernelAbstractions

 export @kernel
-export @Const, @localmem, @private, @uniform, @synchronize, @index, groupsize, @print
+export @Const, @localmem, @private, @uniform, @synchronize, @index, @groupsize, @print
 export Device, GPU, CPU, Event, MultiEvent, NoneEvent
 export async_copy!

@@ -111,14 +111,20 @@ function async_copy! end
 ###

 """
-    groupsize()
+    @groupsize()

 Query the workgroupsize on the device. This function returns
 a tuple corresponding to kernel configuration. In order to get
-the total size you can use `prod(groupsize())`.
+the total size you can use `prod(@groupsize())`.
 """
 function groupsize end

+macro groupsize()
+    quote
+        $groupsize($(esc(:__ctx__)))
+    end
+end
+
 """
     @localmem T dims

@@ -150,7 +156,7 @@ macro private(T, dims)
         dims = (dims,)
     end
     quote
-        $Scratchpad($(esc(T)), Val($(esc(dims))))
+        $Scratchpad($(esc(:__ctx__)), $(esc(T)), Val($(esc(dims))))
     end
 end

@@ -297,7 +303,7 @@ macro index(locale, args...)
     end

     index_function = Symbol(:__index_, locale, :_, indexkind)
-    Expr(:call, GlobalRef(KernelAbstractions, index_function), map(esc, args)...)
+    Expr(:call, GlobalRef(KernelAbstractions, index_function), esc(:__ctx__), map(esc, args)...)
 end

 ###
@@ -312,9 +318,9 @@ function __index_Local_Cartesian end
 function __index_Group_Cartesian end
 function __index_Global_Cartesian end

-__index_Local_NTuple(I...) = Tuple(__index_Local_Cartesian(I...))
-__index_Group_NTuple(I...) = Tuple(__index_Group_Cartesian(I...))
-__index_Global_NTuple(I...) = Tuple(__index_Global_Cartesian(I...))
+__index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...))
+__index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...))
+__index_Global_NTuple(ctx, I...) = Tuple(__index_Global_Cartesian(ctx, I...))

 struct ConstAdaptor end

@@ -429,7 +435,7 @@ include("macros.jl")
 # Backends/Interface
 ###

-function Scratchpad(::Type{T}, ::Val{Dims}) where {T, Dims}
+function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
     throw(MethodError(Scratchpad, (T, Val(Dims))))
 end
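The load-bearing trick in @groupsize, @private, and @index is esc(:__ctx__): each macro expands to a call on a variable the user never writes, and escaping the symbol defeats macro hygiene so it binds to the __ctx__ argument that @kernel adds to the generated kernel function. The same pattern in a standalone sketch (current_ctx and run_with_ctx are hypothetical, not part of the package):

# A macro that reaches for a variable named `ctx` in the caller's scope.
# Without esc(), hygiene would gensym the symbol and the lookup would fail.
macro current_ctx()
    esc(:ctx)
end

run_with_ctx(f) = f((groupsize = (4, 4),))  # NamedTuple stands in for CompilerMetadata

# The macro binds to the `ctx` parameter of the anonymous function:
total = run_with_ctx(ctx -> prod((@current_ctx).groupsize))
@assert total == 16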

src/compiler.jl
Lines changed: 2 additions & 2 deletions

@@ -32,8 +32,8 @@ include("compiler/pass.jl")

 function generate_overdubs(mod, Ctx)
     @eval mod begin
-        @inline Cassette.overdub(ctx::$Ctx, ::typeof(groupsize)) = __groupsize(ctx.metadata)
-        @inline Cassette.overdub(ctx::$Ctx, ::typeof(__workitems_iterspace)) = workitems(__iterspace(ctx.metadata))
+        @inline Cassette.overdub(::$Ctx, ::typeof(groupsize), ctx) = __groupsize(ctx)
+        @inline Cassette.overdub(::$Ctx, ::typeof(__workitems_iterspace), ctx) = workitems(__iterspace(ctx))

         ###
         # Cassette fixes
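Together with the @groupsize macro above, this completes the chain: @groupsize() in a kernel body expands to groupsize(__ctx__); the Cassette pass turns that call into Cassette.overdub(Ctx, groupsize, __ctx__); and this method answers it with __groupsize(ctx), reading the workgroup size from the CompilerMetadata.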
