@@ -3,11 +3,13 @@ module ROCKernels
 export ROCBackend
 
 import AMDGPU
+import AMDGPU: rocconvert, hipfunction
 import AMDGPU.Device: @device_override
-using AMDGPU: GPUArrays, rocSPARSE
+using AMDGPU: GPUArrays, rocSPARSE, HIP
 
 import Adapt
 import KernelAbstractions as KA
+import KernelAbstractions.KernelIntrinsics as KI
 import LLVM
 
 using StaticArraysCore: MArray
@@ -127,32 +129,57 @@ function KA.mkcontext(kernel::KA.Kernel{ROCBackend}, I, _ndrange, iterspace, ::Dynamic) where Dynamic
     metadata = KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
 end
 
-# Indexing.
+KI.argconvert(::ROCBackend, arg) = rocconvert(arg)
+
+function KI.kernel_function(::ROCBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F,TT}
+    kern = hipfunction(f, tt; name, kwargs...)
+    KI.Kernel{ROCBackend, typeof(kern)}(ROCBackend(), kern)
+end
+
+function (obj::KI.Kernel{ROCBackend})(args...; numworkgroups = 1, workgroupsize = 1)
+    KI.check_launch_args(numworkgroups, workgroupsize)
+
+    obj.kern(args...; groupsize = workgroupsize, gridsize = numworkgroups)
+    return nothing
+end
+
 
-@device_override @inline function KA.__index_Local_Linear(ctx)
-    return AMDGPU.Device.threadIdx().x
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:ROCBackend}; max_work_items::Int = Int(typemax(Int32)))::Int
+    (; groupsize) = AMDGPU.launch_configuration(kikern.kern; max_block_size = max_work_items)
+
+    return Int(min(max_work_items, groupsize))
+end
+function KI.max_work_group_size(::ROCBackend)::Int
+    Int(HIP.attribute(AMDGPU.HIP.device(), AMDGPU.HIP.hipDeviceAttributeMaxThreadsPerBlock))
+end
+function KI.multiprocessor_count(::ROCBackend)::Int
+    Int(HIP.attribute(AMDGPU.HIP.device(), AMDGPU.HIP.hipDeviceAttributeMultiprocessorCount))
+end
+
+# Indexing.
+## COV_EXCL_START
+@device_override @inline function KI.get_local_id()
+    return (; x = Int(AMDGPU.Device.workitemIdx().x), y = Int(AMDGPU.Device.workitemIdx().y), z = Int(AMDGPU.Device.workitemIdx().z))
 end
 
-@device_override @inline function KA.__index_Group_Linear(ctx)
-    return AMDGPU.Device.blockIdx().x
+@device_override @inline function KI.get_group_id()
+    return (; x = Int(AMDGPU.Device.workgroupIdx().x), y = Int(AMDGPU.Device.workgroupIdx().y), z = Int(AMDGPU.Device.workgroupIdx().z))
 end
 
-@device_override @inline function KA.__index_Global_Linear(ctx)
-    I = @inbounds KA.expand(KA.__iterspace(ctx), AMDGPU.Device.blockIdx().x, AMDGPU.Device.threadIdx().x)
-    # TODO: This is unfortunate, can we get the linear index cheaper
-    @inbounds LinearIndices(KA.__ndrange(ctx))[I]
+@device_override @inline function KI.get_global_id()
+    return (; x = Int((AMDGPU.Device.workgroupIdx().x - 1) * AMDGPU.Device.blockDim().x + AMDGPU.Device.workitemIdx().x), y = Int((AMDGPU.Device.workgroupIdx().y - 1) * AMDGPU.Device.blockDim().y + AMDGPU.Device.workitemIdx().y), z = Int((AMDGPU.Device.workgroupIdx().z - 1) * AMDGPU.Device.blockDim().z + AMDGPU.Device.workitemIdx().z))
 end
 
-@device_override @inline function KA.__index_Local_Cartesian(ctx)
-    @inbounds KA.workitems(KA.__iterspace(ctx))[AMDGPU.Device.threadIdx().x]
+@device_override @inline function KI.get_local_size()
+    return (; x = Int(AMDGPU.Device.workgroupDim().x), y = Int(AMDGPU.Device.workgroupDim().y), z = Int(AMDGPU.Device.workgroupDim().z))
 end
 
-@device_override @inline function KA.__index_Group_Cartesian(ctx)
-    @inbounds KA.blocks(KA.__iterspace(ctx))[AMDGPU.Device.blockIdx().x]
+@device_override @inline function KI.get_num_groups()
+    return (; x = Int(AMDGPU.Device.gridGroupDim().x), y = Int(AMDGPU.Device.gridGroupDim().y), z = Int(AMDGPU.Device.gridGroupDim().z))
 end
 
-@device_override @inline function KA.__index_Global_Cartesian(ctx)
-    return @inbounds KA.expand(KA.__iterspace(ctx), AMDGPU.Device.blockIdx().x, AMDGPU.Device.threadIdx().x)
+@device_override @inline function KI.get_global_size()
+    return (; x = Int(AMDGPU.Device.gridItemDim().x), y = Int(AMDGPU.Device.gridItemDim().y), z = Int(AMDGPU.Device.gridItemDim().z))
 end
 
 @device_override @inline function KA.__validindex(ctx)
@@ -166,8 +193,8 @@
 
 # Shared memory.
 
-@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
-    ptr = AMDGPU.Device.alloc_special(Val(Id), T, Val(AMDGPU.AS.Local), Val(prod(Dims)))
+@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
+    ptr = AMDGPU.Device.alloc_special(Val(:shmem), T, Val(AMDGPU.AS.Local), Val(prod(Dims)))
     AMDGPU.ROCDeviceArray(Dims, ptr)
 end
 
@@ -177,12 +204,13 @@ end
 
 # Other.
 
-@device_override @inline function KA.__synchronize()
+@device_override @inline function KI.barrier()
     AMDGPU.Device.sync_workgroup()
 end
 
-@device_override @inline function KA.__print(args...)
+@device_override @inline function KI._print(args...)
     # TODO
 end
+## COV_EXCL_STOP
 
 end
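
For anyone trying the new entry points, here is a minimal host-side launch sketch. It is not part of the diff: the kernel `vadd!` and all variable names are illustrative, and it assumes the type tuple passed to `KI.kernel_function` is built from `KI.argconvert`ed (device-side) argument types, which this patch implies but does not itself demonstrate.

using AMDGPU
import KernelAbstractions.KernelIntrinsics as KI

# One work-item per element; KI.get_global_id() returns the NamedTuple
# of Ints defined by the overrides above, so .x is a plain Int.
function vadd!(c, a, b)
    i = KI.get_global_id().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[i]
    end
    return nothing
end

backend = AMDGPU.ROCBackend()
a = AMDGPU.ones(Float32, 1024)
b = AMDGPU.ones(Float32, 1024)
c = AMDGPU.zeros(Float32, 1024)

# Assumption: the compiled signature uses the argconvert'ed argument types.
tt = Tuple{map(x -> typeof(KI.argconvert(backend, x)), (c, a, b))...}
kern = KI.kernel_function(backend, vadd!, tt)

# Size the launch from the queried per-kernel limit instead of hard-coding it.
wgsize = min(KI.kernel_max_work_group_size(kern), length(c))
kern(c, a, b; numworkgroups = cld(length(c), wgsize), workgroupsize = wgsize)

Deriving `wgsize` from `kernel_max_work_group_size` (which defers to `AMDGPU.launch_configuration`) keeps the launch valid across devices with different `hipDeviceAttributeMaxThreadsPerBlock` limits.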