JuliaGPU · christiangnrd · Nov 18, 2025 · Dec 17, 2025 · Dec 15, 2025 · Dec 17, 2025
diff --git a/Project.toml b/Project.toml
@@ -37,6 +37,10 @@ UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 
+[sources]
+KernelAbstractions = {rev = "main", url = "https://github.com/JuliaGPU/KernelAbstractions.jl"}
+AcceleratedKernels = {rev = "ka0.10simple", url = "https://github.com/christiangnrd/AcceleratedKernels.jl"}
+
 [extensions]
 AMDGPUChainRulesCoreExt = "ChainRulesCore"
 AMDGPUEnzymeCoreExt = "EnzymeCore"
@@ -53,7 +57,7 @@ ExprTools = "0.1"
 GPUArrays = "11.3.1"
 GPUCompiler = "1"
 GPUToolbox = "0.1.0, 0.2, 0.3, 1"
-KernelAbstractions = "0.9.2"
+KernelAbstractions = "0.9, 0.10"
 LLD_jll = "15, 16, 17, 18, 19"
 LLVM = "9"
 LLVM_jll = "15, 16, 17, 18, 19"

diff --git a/src/ROCKernels.jl b/src/ROCKernels.jl
@@ -3,11 +3,13 @@ module ROCKernels
 export ROCBackend
 
 import AMDGPU
+import AMDGPU: rocconvert, hipfunction
 import AMDGPU.Device: @device_override
-using AMDGPU: GPUArrays, rocSPARSE
+using AMDGPU: GPUArrays, rocSPARSE, HIP
 
 import Adapt
 import KernelAbstractions as KA
+import KernelAbstractions.KernelIntrinsics as KI
 import LLVM
 
 using StaticArraysCore: MArray
@@ -127,32 +129,57 @@ function KA.mkcontext(kernel::KA.Kernel{ROCBackend}, I, _ndrange, iterspace, ::D
     metadata = KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
 end
 
-# Indexing.
+KI.argconvert(::ROCBackend, arg) = rocconvert(arg)
+
+function KI.kernel_function(::ROCBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+    kern = hipfunction(f, tt; name, kwargs...)
+    KI.Kernel{ROCBackend, typeof(kern)}(ROCBackend(), kern)
+end
+
+function (obj::KI.Kernel{ROCBackend})(args...; numworkgroups = 1, workgroupsize = 1)
+    KI.check_launch_args(numworkgroups, workgroupsize)
+
+    obj.kern(args...; groupsize = workgroupsize, gridsize = numworkgroups)
+    return nothing
+end
+
 
-@device_override @inline function KA.__index_Local_Linear(ctx)
-    return AMDGPU.Device.threadIdx().x
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:ROCBackend}; max_work_items::Int=Int(typemax(Int32)))::Int
+    (; groupsize) = AMDGPU.launch_configuration(kikern.kern; max_block_size = max_work_items)
+
+    return Int(min(max_work_items, groupsize))
+end
+function KI.max_work_group_size(::ROCBackend)::Int
+    Int(HIP.attribute(AMDGPU.HIP.device(), AMDGPU.HIP.hipDeviceAttributeMaxThreadsPerBlock))
+end
+function KI.multiprocessor_count(::ROCBackend)::Int
+    Int(HIP.attribute(AMDGPU.HIP.device(), AMDGPU.HIP.hipDeviceAttributeMultiprocessorCount))
+end
+
+# Indexing.
+## COV_EXCL_START
+@device_override @inline function KI.get_local_id()
+    return (; x = Int(AMDGPU.Device.workitemIdx().x), y = Int(AMDGPU.Device.workitemIdx().y), z = Int(AMDGPU.Device.workitemIdx().z))
 end
 
-@device_override @inline function KA.__index_Group_Linear(ctx)
-    return AMDGPU.Device.blockIdx().x
+@device_override @inline function KI.get_group_id()
+    return (; x = Int(AMDGPU.Device.workgroupIdx().x), y = Int(AMDGPU.Device.workgroupIdx().y), z = Int(AMDGPU.Device.workgroupIdx().z))
 end
 
-@device_override @inline function KA.__index_Global_Linear(ctx)
-    I =  @inbounds KA.expand(KA.__iterspace(ctx), AMDGPU.Device.blockIdx().x, AMDGPU.Device.threadIdx().x)
-    # TODO: This is unfortunate, can we get the linear index cheaper
-    @inbounds LinearIndices(KA.__ndrange(ctx))[I]
+@device_override @inline function KI.get_global_id()
+    return (; x = Int((AMDGPU.Device.workgroupIdx().x-1)*AMDGPU.Device.blockDim().x + AMDGPU.Device.workitemIdx().x), y = Int((AMDGPU.Device.workgroupIdx().y-1)*AMDGPU.Device.blockDim().y + AMDGPU.Device.workitemIdx().y), z = Int((AMDGPU.Device.workgroupIdx().z-1)*AMDGPU.Device.blockDim().z + AMDGPU.Device.workitemIdx().z))
 end
 
-@device_override @inline function KA.__index_Local_Cartesian(ctx)
-    @inbounds KA.workitems(KA.__iterspace(ctx))[AMDGPU.Device.threadIdx().x]
+@device_override @inline function KI.get_local_size()
+    return (; x = Int(AMDGPU.Device.workgroupDim().x), y = Int(AMDGPU.Device.workgroupDim().y), z = Int(AMDGPU.Device.workgroupDim().z))
 end
 
-@device_override @inline function KA.__index_Group_Cartesian(ctx)
-    @inbounds KA.blocks(KA.__iterspace(ctx))[AMDGPU.Device.blockIdx().x]
+@device_override @inline function KI.get_num_groups()
+    return (; x = Int(AMDGPU.Device.gridGroupDim().x), y = Int(AMDGPU.Device.gridGroupDim().y), z = Int(AMDGPU.Device.gridGroupDim().z))
 end
 
-@device_override @inline function KA.__index_Global_Cartesian(ctx)
-    return @inbounds KA.expand(KA.__iterspace(ctx), AMDGPU.Device.blockIdx().x, AMDGPU.Device.threadIdx().x)
+@device_override @inline function KI.get_global_size()
+    return (; x = Int(AMDGPU.Device.gridItemDim().x), y = Int(AMDGPU.Device.gridItemDim().y), z = Int(AMDGPU.Device.gridItemDim().z))
 end
 
 @device_override @inline function KA.__validindex(ctx)
@@ -166,8 +193,8 @@ end
 
 # Shared memory.
 
-@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
-    ptr = AMDGPU.Device.alloc_special(Val(Id), T, Val(AMDGPU.AS.Local), Val(prod(Dims)))
+@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
+    ptr = AMDGPU.Device.alloc_special(Val(:shmem), T, Val(AMDGPU.AS.Local), Val(prod(Dims)))
     AMDGPU.ROCDeviceArray(Dims, ptr)
 end
 
@@ -177,12 +204,13 @@ end
 
 # Other.
 
-@device_override @inline function KA.__synchronize()
+@device_override @inline function KI.barrier()
     AMDGPU.Device.sync_workgroup()
 end
 
-@device_override @inline function KA.__print(args...)
+@device_override @inline function KI._print(args...)
     # TODO
 end
+## COV_EXCL_STOP
 
 end
diff --git a/src/device/gcn/memory_static.jl b/src/device/gcn/memory_static.jl
@@ -2,7 +2,7 @@
 @generated function alloc_special(
     ::Val{id}, ::Type{T}, ::Val{as}, ::Val{len}, ::Val{zeroinit} = Val{false}(),
 ) where {id,T,as,len,zeroinit}
-    @dispose ctx=Context() begin
+    Context() do ctx
         eltyp = convert(LLVMType, T)
 
         # old versions of GPUArrays invoke _shmem with an integer id; make sure those are unique
@@ -24,8 +24,8 @@
         gv = GlobalVariable(mod, gv_typ, string(id), as)
         if len > 0
             if as == AS.Local
-                linkage!(gv, LLVM.API.LLVMExternalLinkage)
-                # NOTE: Backend doesn't support initializer for local AS
+                linkage!(gv, LLVM.API.LLVMInternalLinkage)
+                initializer!(gv, UndefValue(gv_typ))
             elseif as == AS.Private
                 linkage!(gv, LLVM.API.LLVMInternalLinkage)
                 initializer!(gv, null(gv_typ))
@@ -38,7 +38,7 @@
         alignment!(gv, Base.max(32, Base.datatype_alignment(T)))
 
         # generate IR
-        @dispose builder=IRBuilder() begin
+        IRBuilder() do builder
             entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 

diff --git a/test/ka_tests.jl b/test/ka_tests.jl
@@ -6,7 +6,7 @@ include(joinpath(pkgdir(KernelAbstractions), "test", "testsuite.jl"))
 AMDGPU.allowscalar(false)
 
 # TODO fix Printing
-skip_tests = ["Printing", "sparse"]
+skip_tests = ["Printing", "sparse", "CPU synchronization", "fallback test: callable types",]
 if Sys.iswindows()
     # TODO
     # We do not support hostcalls on Windows yet.

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,3 +1,9 @@
+@static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "AMDGPU.jl") == "AMDGPU.jl"
+    using Pkg
+    Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")
+    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+end
+
 using AMDGPU
 using AMDGPU: Device, Runtime, @allowscalar
 import AMDGPU.Device: HostCallHolder, hostcall!
@@ -35,7 +41,8 @@ end
 
 AMDGPU.allowscalar(false)
 
-const TEST_NAMES = ["core", "hip", "ext", "gpuarrays", "kernelabstractions", "enzyme"]
+# const TEST_NAMES = ["core", "hip", "ext", "gpuarrays", "kernelabstractions", "enzyme"]
+const TEST_NAMES = ["kernelabstractions"]
 
 function parse_flags!(args, flag; default = nothing, typ = typeof(default))
     for f in args