KernelIntrisics and related tweaks (#666)

christiangnrd · web-flow · commit dc57cd0af41d · 2026-01-12T21:51:30.000-04:00
* KernelIntrinsics Tweaks

* Fix temporary AK compat

* Improve KI tests
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -109,8 +109,7 @@ steps:
       julia -e 'println("--- :julia: Developing oneAPI")
                 using Pkg
                 Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics")
-                Pkg.develop(; name="AcceleratedKernels")'
-      sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml
+                Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
                 Pkg.develop(; path=pwd())' || exit 3
@@ -141,8 +140,7 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing AMDGPU")
                 using Pkg
-                Pkg.develop(; name="AcceleratedKernels")'
-      sed -i 's/^KernelAbstractions = "0\.9.*"/KernelAbstractions = "0.9, 0.10"/' \${JULIA_DEPOT_PATH}/dev/AcceleratedKernels/Project.toml
+                Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
       julia -e '
                 using Pkg
                 Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics")
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
@@ -833,7 +833,7 @@ include("macros.jl")
 ###
 
 function Scratchpad end
-SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims)
+SharedMemory(::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(T, dims)
 
 __synchronize() = KI.barrier()
 
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
@@ -103,14 +103,14 @@ Returns the unique group ID.
 function get_group_id end
 
 """
-    localmemory(T, dims)
+    localmemory(::Type{T}, dims)
 
 Declare memory that is local to a workgroup.
 
 !!! note
     Backend implementations **must** implement:
     ```
-    @device_override localmemory(T::DataType, ::Val{Dims}) where {T, Dims}
+    @device_override localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
     ```
     As well as the on-device functionality.
 """
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
@@ -1,21 +1,25 @@
 import KernelAbstractions.KernelIntrinsics as KI
 
+struct KernelData
+    global_size::Int
+    global_id::Int
+    local_size::Int
+    local_id::Int
+    num_groups::Int
+    group_id::Int
+end
 function test_intrinsics_kernel(results)
-    # Test all intrinsics return NamedTuples with x, y, z fields
-    global_size = KI.get_global_size()
-    global_id = KI.get_global_id()
-    local_size = KI.get_local_size()
-    local_id = KI.get_local_id()
-    num_groups = KI.get_num_groups()
-    group_id = KI.get_group_id()
-
-    if UInt32(global_id.x) <= UInt32(global_size.x)
-        results[1, global_id.x] = global_id.x
-        results[2, global_id.x] = local_id.x
-        results[3, global_id.x] = group_id.x
-        results[4, global_id.x] = global_size.x
-        results[5, global_id.x] = local_size.x
-        results[6, global_id.x] = num_groups.x
+    i = KI.get_global_id().x
+
+    if i <= length(results)
+        @inbounds results[i] = KernelData(
+            KI.get_global_size().x,
+            KI.get_global_id().x,
+            KI.get_local_size().x,
+            KI.get_local_id().x,
+            KI.get_num_groups().x,
+            KI.get_group_id().x
+        )
     end
     return
 end
@@ -82,41 +86,40 @@ function intrinsics_testsuite(backend, AT)
             @test KI.multiprocessor_count(backend()) isa Int
 
             # Test with small kernel
-            N = 16
-            results = AT(zeros(Int, 6, N))
+            workgroupsize = 4
+            numworkgroups = 4
+            N = workgroupsize * numworkgroups
+            results = AT(Vector{KernelData}(undef, N))
             kernel = KI.@kernel backend() launch = false test_intrinsics_kernel(results)
 
             @test KI.kernel_max_work_group_size(kernel) isa Int
             @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1
 
-            kernel(results, workgroupsize = 4, numworkgroups = 4)
+            kernel(results; workgroupsize, numworkgroups)
             KernelAbstractions.synchronize(backend())
 
             host_results = Array(results)
 
             # Verify results make sense
-            for i in 1:N
-                global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i]
+            for (i, k_data) in enumerate(host_results)
 
                 # Global IDs should be 1-based and sequential
-                @test global_id_x == i
+                @test k_data.global_id == i
 
                 # Global size should match our ndrange
-                @test global_size_x == N
+                @test k_data.global_size == N
 
-                # Local size should be 4 (our workgroupsize)
-                @test local_size_x == 4
+                @test k_data.local_size == workgroupsize
 
-                # Number of groups should be ceil(N/4) = 4
-                @test num_groups_x == 4
+                @test k_data.num_groups == numworkgroups
 
                 # Group ID should be 1-based
-                expected_group = div(i - 1, 4) + 1
-                @test group_id_x == expected_group
+                expected_group = div(i - 1, numworkgroups) + 1
+                @test k_data.group_id == expected_group
 
                 # Local ID should be 1-based within group
-                expected_local = ((i - 1) % 4) + 1
-                @test local_id_x == expected_local
+                expected_local = ((i - 1) % workgroupsize) + 1
+                @test k_data.local_id == expected_local
             end
         end
     end