Skip to content

Commit 48b807e

Browse files
committed
KernelIntrinsics
1 parent 0c3073c commit 48b807e

File tree

2 files changed

+49
-21
lines changed

2 files changed

+49
-21
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ ExprTools = "0.1"
5353
GPUArrays = "11.3.1"
5454
GPUCompiler = "1"
5555
GPUToolbox = "0.1.0, 0.2, 0.3, 1"
56-
KernelAbstractions = "0.9.2"
56+
KernelAbstractions = "0.10"
5757
LLD_jll = "15, 16, 17, 18, 19"
5858
LLVM = "9"
5959
LLVM_jll = "15, 16, 17, 18, 19"

src/ROCKernels.jl

Lines changed: 48 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@ module ROCKernels
33
export ROCBackend
44

55
import AMDGPU
6+
import AMDGPU: rocconvert, hipfunction
67
import AMDGPU.Device: @device_override
7-
using AMDGPU: GPUArrays, rocSPARSE
8+
using AMDGPU: GPUArrays, rocSPARSE, HIP
89

910
import Adapt
1011
import KernelAbstractions as KA
12+
import KernelAbstractions.KernelIntrinsics as KI
1113
import LLVM
1214

1315
using StaticArraysCore: MArray
@@ -127,32 +129,57 @@ function KA.mkcontext(kernel::KA.Kernel{ROCBackend}, I, _ndrange, iterspace, ::D
127129
metadata = KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
128130
end
129131

130-
# Indexing.
132+
KI.argconvert(::ROCBackend, arg) = rocconvert(arg)
133+
134+
function KI.kernel_function(::ROCBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
135+
kern = hipfunction(f, tt; name, kwargs...)
136+
KI.Kernel{ROCBackend, typeof(kern)}(ROCBackend(), kern)
137+
end
138+
139+
function (obj::KI.Kernel{ROCBackend})(args...; numworkgroups = 1, workgroupsize = 1)
140+
KI.check_launch_args(numworkgroups, workgroupsize)
141+
142+
obj.kern(args...; groupsize = workgroupsize, gridsize = numworkgroups)
143+
return nothing
144+
end
145+
131146

132-
@device_override @inline function KA.__index_Local_Linear(ctx)
133-
return AMDGPU.Device.threadIdx().x
147+
function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:ROCBackend}; max_work_items::Int=Int(typemax(Int32)))::Int
148+
(; groupsize) = AMDGPU.launch_configuration(kikern.kern; max_block_size = max_work_items)
149+
150+
return Int(min(max_work_items, groupsize))
151+
end
152+
function KI.max_work_group_size(::ROCBackend)::Int
153+
Int(HIP.attribute(AMDGPU.HIP.device(), AMDGPU.HIP.hipDeviceAttributeMaxThreadsPerBlock))
154+
end
155+
function KI.multiprocessor_count(::ROCBackend)::Int
156+
Int(HIP.attribute(AMDGPU.HIP.device(), AMDGPU.HIP.hipDeviceAttributeMultiprocessorCount))
157+
end
158+
159+
# Indexing.
160+
## COV_EXCL_START
161+
@device_override @inline function KI.get_local_id()
162+
return (; x = Int(AMDGPU.Device.workitemIdx().x), y = Int(AMDGPU.Device.workitemIdx().y), z = Int(AMDGPU.Device.workitemIdx().z))
134163
end
135164

136-
@device_override @inline function KA.__index_Group_Linear(ctx)
137-
return AMDGPU.Device.blockIdx().x
165+
@device_override @inline function KI.get_group_id()
166+
return (; x = Int(AMDGPU.Device.workgroupIdx().x), y = Int(AMDGPU.Device.workgroupIdx().y), z = Int(AMDGPU.Device.workgroupIdx().z))
138167
end
139168

140-
@device_override @inline function KA.__index_Global_Linear(ctx)
141-
I = @inbounds KA.expand(KA.__iterspace(ctx), AMDGPU.Device.blockIdx().x, AMDGPU.Device.threadIdx().x)
142-
# TODO: This is unfortunate, can we get the linear index cheaper
143-
@inbounds LinearIndices(KA.__ndrange(ctx))[I]
169+
@device_override @inline function KI.get_global_id()
170+
return (; x = Int((AMDGPU.Device.workgroupIdx().x-1)*AMDGPU.Device.blockDim().x + AMDGPU.Device.workitemIdx().x), y = Int((AMDGPU.Device.workgroupIdx().y-1)*AMDGPU.Device.blockDim().y + AMDGPU.Device.workitemIdx().y), z = Int((AMDGPU.Device.workgroupIdx().z-1)*AMDGPU.Device.blockDim().z + AMDGPU.Device.workitemIdx().z))
144171
end
145172

146-
@device_override @inline function KA.__index_Local_Cartesian(ctx)
147-
@inbounds KA.workitems(KA.__iterspace(ctx))[AMDGPU.Device.threadIdx().x]
173+
@device_override @inline function KI.get_local_size()
174+
return (; x = Int(AMDGPU.Device.workgroupDim().x), y = Int(AMDGPU.Device.workgroupDim().y), z = Int(AMDGPU.Device.workgroupDim().z))
148175
end
149176

150-
@device_override @inline function KA.__index_Group_Cartesian(ctx)
151-
@inbounds KA.blocks(KA.__iterspace(ctx))[AMDGPU.Device.blockIdx().x]
177+
@device_override @inline function KI.get_num_groups()
178+
return (; x = Int(AMDGPU.Device.gridGroupDim().x), y = Int(AMDGPU.Device.gridGroupDim().y), z = Int(AMDGPU.Device.gridGroupDim().z))
152179
end
153180

154-
@device_override @inline function KA.__index_Global_Cartesian(ctx)
155-
return @inbounds KA.expand(KA.__iterspace(ctx), AMDGPU.Device.blockIdx().x, AMDGPU.Device.threadIdx().x)
181+
@device_override @inline function KI.get_global_size()
182+
return (; x = Int(AMDGPU.Device.gridItemDim().x), y = Int(AMDGPU.Device.gridItemDim().y), z = Int(AMDGPU.Device.gridItemDim().z))
156183
end
157184

158185
@device_override @inline function KA.__validindex(ctx)
@@ -166,8 +193,8 @@ end
166193

167194
# Shared memory.
168195

169-
@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
170-
ptr = AMDGPU.Device.alloc_special(Val(Id), T, Val(AMDGPU.AS.Local), Val(prod(Dims)))
196+
@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
197+
ptr = AMDGPU.Device.alloc_special(Val(:shmem), T, Val(AMDGPU.AS.Local), Val(prod(Dims)))
171198
AMDGPU.ROCDeviceArray(Dims, ptr)
172199
end
173200

@@ -177,12 +204,13 @@ end
177204

178205
# Other.
179206

180-
@device_override @inline function KA.__synchronize()
207+
@device_override @inline function KI.barrier()
181208
AMDGPU.Device.sync_workgroup()
182209
end
183210

184-
@device_override @inline function KA.__print(args...)
211+
@device_override @inline function KI._print(args...)
185212
# TODO
186213
end
214+
## COV_EXCL_STOP
187215

188216
end

0 commit comments

Comments
 (0)