KernelIntrinsics

christiangnrd · christiangnrd · commit 792451cdcf84 · 2025-10-22T01:01:12.000-03:00
diff --git a/Project.toml b/Project.toml
@@ -42,7 +42,7 @@ ExprTools = "0.1"
 GPUArrays = "11.2.1"
 GPUCompiler = "1.7.1"
 GPUToolbox = "0.1, 0.2, 0.3, 1"
-KernelAbstractions = "0.9.38"
+KernelAbstractions = "0.10"
 LLVM = "7.2, 8, 9"
 LLVMDowngrader_jll = "0.6"
 LinearAlgebra = "1"
diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
@@ -4,6 +4,7 @@ using ..Metal
 using ..Metal: @device_override, DefaultStorageMode, SharedStorage
 
 import KernelAbstractions as KA
+import KernelAbstractions: KernelIntrinsics as KI
 
 using StaticArrays: MArray
 
@@ -133,35 +134,58 @@ function (obj::KA.Kernel{MetalBackend})(args...; ndrange=nothing, workgroupsize=
     return nothing
 end
 
+# Compile `f` for the Metal backend, specialized on the types of `args`, and wrap
+# the compiled kernel in a `KI.KIKernel`. Nothing is launched here.
+#
+# `args` are only used (after `Metal.mtlconvert`) to derive the kernel's argument
+# type tuple. `kwargs` are forwarded to `Metal.mtlfunction` as keyword arguments.
+#
+# The compiler entry points are called directly instead of `eval`-ing an
+# `@metal launch=false ...` expression: splatting `kwargs...` into the macro call
+# interpolated `Pair` values rather than `key = value` expressions, and building
+# plus evaluating an expression at module scope for every kernel is unnecessary.
+# NOTE(review): this mirrors what `@metal launch=false` is expected to expand to
+# (convert, take `Core.Typeof`, call `mtlfunction`) — confirm against Metal.jl.
+function KI.KIKernel(::MetalBackend, f, args...; kwargs...)
+    kernel_f = Metal.mtlconvert(f)
+    kernel_args = map(Metal.mtlconvert, args)
+    kernel_tt = Tuple{map(Core.Typeof, kernel_args)...}
+    kern = Metal.mtlfunction(kernel_f, kernel_tt; kwargs...)
+    return KI.KIKernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+end
+
+# Launch the wrapped Metal kernel with `args`.
+#
+# `workgroupsize` is passed on as the kernel's `threads` keyword and
+# `numworkgroups` as its `groups` keyword; each falls back to 1 when `nothing`.
+function (obj::KI.KIKernel{MetalBackend})(args...; numworkgroups=nothing, workgroupsize=nothing)
+    threads = workgroupsize
+    groups = numworkgroups
+    threads === nothing && (threads = 1)
+    groups === nothing && (groups = 1)
+
+    return obj.kern(args...; threads=threads, groups=groups)
+end
+
+
+# Work-group size limit for one specific compiled kernel: the smaller of the
+# kernel pipeline's `maxTotalThreadsPerThreadgroup` and `max_work_items`.
+function KI.kernel_max_work_group_size(::B, kikern::KI.KIKernel{B}; max_work_items::Int=typemax(Int)) where {B<:MetalBackend}
+    pipeline_limit = kikern.kern.pipeline.maxTotalThreadsPerThreadgroup
+    return min(pipeline_limit, max_work_items)
+end
+# Backend-wide work-group size limit: the `width` component of the current
+# device's `maxThreadsPerThreadgroup`.
+function KI.max_work_group_size(::MetalBackend)
+    limits = device().maxThreadsPerThreadgroup
+    return limits.width
+end
+# Multiprocessor count for this backend, taken from `Metal.num_gpu_cores()`.
+KI.multiprocessor_count(::MetalBackend) = Metal.num_gpu_cores()
+
+
 
 ## indexing
 
 ## COV_EXCL_START
-@device_override @inline function KA.__index_Local_Linear(ctx)
-    return thread_position_in_threadgroup().x
+# Device-side override: `thread_position_in_threadgroup()` repackaged as an
+# `(x, y, z)` named tuple with every component converted to `Int`.
+@device_override @inline function KI.get_local_id()
+    pos = thread_position_in_threadgroup()
+    return (; x = Int(pos.x), y = Int(pos.y), z = Int(pos.z))
 end
 
-@device_override @inline function KA.__index_Group_Linear(ctx)
-    return threadgroup_position_in_grid().x
+# Device-side override: `threadgroup_position_in_grid()` repackaged as an
+# `(x, y, z)` named tuple with every component converted to `Int`.
+@device_override @inline function KI.get_group_id()
+    pos = threadgroup_position_in_grid()
+    return (; x = Int(pos.x), y = Int(pos.y), z = Int(pos.z))
 end
 
-@device_override @inline function KA.__index_Global_Linear(ctx)
-    I =  @inbounds KA.expand(KA.__iterspace(ctx), threadgroup_position_in_grid().x, thread_position_in_threadgroup().x)
-    # TODO: This is unfortunate, can we get the linear index cheaper
-    @inbounds LinearIndices(KA.__ndrange(ctx))[I]
+# Device-side override: `thread_position_in_grid()` repackaged as an
+# `(x, y, z)` named tuple with every component converted to `Int`.
+@device_override @inline function KI.get_global_id()
+    pos = thread_position_in_grid()
+    return (; x = Int(pos.x), y = Int(pos.y), z = Int(pos.z))
 end
 
-@device_override @inline function KA.__index_Local_Cartesian(ctx)
-    @inbounds KA.workitems(KA.__iterspace(ctx))[thread_position_in_threadgroup().x]
+# Device-side override: `threads_per_threadgroup()` repackaged as an
+# `(x, y, z)` named tuple with every component converted to `Int`.
+@device_override @inline function KI.get_local_size()
+    extent = threads_per_threadgroup()
+    return (; x = Int(extent.x), y = Int(extent.y), z = Int(extent.z))
 end
 
-@device_override @inline function KA.__index_Group_Cartesian(ctx)
-    @inbounds KA.blocks(KA.__iterspace(ctx))[threadgroup_position_in_grid().x]
+# Device-side override: `threadgroups_per_grid()` repackaged as an
+# `(x, y, z)` named tuple with every component converted to `Int`.
+@device_override @inline function KI.get_num_groups()
+    extent = threadgroups_per_grid()
+    return (; x = Int(extent.x), y = Int(extent.y), z = Int(extent.z))
 end
 
-@device_override @inline function KA.__index_Global_Cartesian(ctx)
-    return @inbounds KA.expand(KA.__iterspace(ctx), threadgroup_position_in_grid().x,
-                               thread_position_in_threadgroup().x)
+# Device-side override: `threads_per_grid()` repackaged as an
+# `(x, y, z)` named tuple with every component converted to `Int`.
+@device_override @inline function KI.get_global_size()
+    extent = threads_per_grid()
+    return (; x = Int(extent.x), y = Int(extent.y), z = Int(extent.z))
 end
 
 @device_override @inline function KA.__validindex(ctx)
@@ -177,8 +201,7 @@ end
 
 ## shared memory
 
-@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims},
-                                                  ::Val{Id}) where {T, Dims, Id}
+# Device-side override: request `prod(Dims)` elements of `T` from
+# `Metal.emit_threadgroup_memory` and wrap the returned pointer in an
+# `MtlDeviceArray` with dimensions `Dims`.
+# NOTE(review): unlike the replaced `KA.SharedMemory` method there is no `Id`
+# parameter; presumably distinct call sites still get distinct allocations —
+# confirm against `emit_threadgroup_memory`.
+@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
     ptr = Metal.emit_threadgroup_memory(T, Val(prod(Dims)))
     MtlDeviceArray(Dims, ptr)
 end
@@ -190,7 +213,7 @@ end
 
 ## other
 
-@device_override @inline function KA.__synchronize()
+# Device-side override: issue a `threadgroup_barrier` with both the
+# `MemoryFlagDevice` and `MemoryFlagThreadGroup` flags set.
+@device_override @inline function KI.barrier()
     threadgroup_barrier(Metal.MemoryFlagDevice | Metal.MemoryFlagThreadGroup)
 end