Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ UnsafeAtomics = "013be700-e6cd-48c3-b4a1-df204f14c38f"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"

[sources]
KernelAbstractions = {rev = "main", url = "https://github.com/JuliaGPU/KernelAbstractions.jl"}
AcceleratedKernels = {rev = "ka0.10simple", url = "https://github.com/christiangnrd/AcceleratedKernels.jl"}

[extensions]
AMDGPUChainRulesCoreExt = "ChainRulesCore"
AMDGPUEnzymeCoreExt = "EnzymeCore"
Expand All @@ -53,7 +57,7 @@ ExprTools = "0.1"
GPUArrays = "11.3.1"
GPUCompiler = "1"
GPUToolbox = "0.1.0, 0.2, 0.3, 1"
KernelAbstractions = "0.9.2"
KernelAbstractions = "0.9, 0.10"
LLD_jll = "15, 16, 17, 18, 19"
LLVM = "9"
LLVM_jll = "15, 16, 17, 18, 19"
Expand Down
68 changes: 48 additions & 20 deletions src/ROCKernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ module ROCKernels
export ROCBackend

import AMDGPU
import AMDGPU: rocconvert, hipfunction
import AMDGPU.Device: @device_override
using AMDGPU: GPUArrays, rocSPARSE
using AMDGPU: GPUArrays, rocSPARSE, HIP

import Adapt
import KernelAbstractions as KA
import KernelAbstractions.KernelIntrinsics as KI
import LLVM

using StaticArraysCore: MArray
Expand Down Expand Up @@ -127,32 +129,57 @@ function KA.mkcontext(kernel::KA.Kernel{ROCBackend}, I, _ndrange, iterspace, ::D
metadata = KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
end

# Indexing.
KI.argconvert(::ROCBackend, arg) = rocconvert(arg)

function KI.kernel_function(::ROCBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
kern = hipfunction(f, tt; name, kwargs...)
KI.Kernel{ROCBackend, typeof(kern)}(ROCBackend(), kern)
end

function (obj::KI.Kernel{ROCBackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)

obj.kern(args...; groupsize = workgroupsize, gridsize = numworkgroups)
return nothing
end


@device_override @inline function KA.__index_Local_Linear(ctx)
return AMDGPU.Device.threadIdx().x
function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:ROCBackend}; max_work_items::Int=Int(typemax(Int32)))::Int
(; groupsize) = AMDGPU.launch_configuration(kikern.kern; max_block_size = max_work_items)

return Int(min(max_work_items, groupsize))
end
function KI.max_work_group_size(::ROCBackend)::Int
Int(HIP.attribute(AMDGPU.HIP.device(), AMDGPU.HIP.hipDeviceAttributeMaxThreadsPerBlock))
end
function KI.multiprocessor_count(::ROCBackend)::Int
Int(HIP.attribute(AMDGPU.HIP.device(), AMDGPU.HIP.hipDeviceAttributeMultiprocessorCount))
end

# Indexing.
## COV_EXCL_START
@device_override @inline function KI.get_local_id()
return (; x = Int(AMDGPU.Device.workitemIdx().x), y = Int(AMDGPU.Device.workitemIdx().y), z = Int(AMDGPU.Device.workitemIdx().z))
end

@device_override @inline function KA.__index_Group_Linear(ctx)
return AMDGPU.Device.blockIdx().x
@device_override @inline function KI.get_group_id()
return (; x = Int(AMDGPU.Device.workgroupIdx().x), y = Int(AMDGPU.Device.workgroupIdx().y), z = Int(AMDGPU.Device.workgroupIdx().z))
end

@device_override @inline function KA.__index_Global_Linear(ctx)
I = @inbounds KA.expand(KA.__iterspace(ctx), AMDGPU.Device.blockIdx().x, AMDGPU.Device.threadIdx().x)
# TODO: This is unfortunate, can we get the linear index cheaper
@inbounds LinearIndices(KA.__ndrange(ctx))[I]
@device_override @inline function KI.get_global_id()
return (; x = Int((AMDGPU.Device.workgroupIdx().x-1)*AMDGPU.Device.blockDim().x + AMDGPU.Device.workitemIdx().x), y = Int((AMDGPU.Device.workgroupIdx().y-1)*AMDGPU.Device.blockDim().y + AMDGPU.Device.workitemIdx().y), z = Int((AMDGPU.Device.workgroupIdx().z-1)*AMDGPU.Device.blockDim().z + AMDGPU.Device.workitemIdx().z))
end

@device_override @inline function KA.__index_Local_Cartesian(ctx)
@inbounds KA.workitems(KA.__iterspace(ctx))[AMDGPU.Device.threadIdx().x]
@device_override @inline function KI.get_local_size()
return (; x = Int(AMDGPU.Device.workgroupDim().x), y = Int(AMDGPU.Device.workgroupDim().y), z = Int(AMDGPU.Device.workgroupDim().z))
end

@device_override @inline function KA.__index_Group_Cartesian(ctx)
@inbounds KA.blocks(KA.__iterspace(ctx))[AMDGPU.Device.blockIdx().x]
@device_override @inline function KI.get_num_groups()
return (; x = Int(AMDGPU.Device.gridGroupDim().x), y = Int(AMDGPU.Device.gridGroupDim().y), z = Int(AMDGPU.Device.gridGroupDim().z))
end

@device_override @inline function KA.__index_Global_Cartesian(ctx)
return @inbounds KA.expand(KA.__iterspace(ctx), AMDGPU.Device.blockIdx().x, AMDGPU.Device.threadIdx().x)
@device_override @inline function KI.get_global_size()
return (; x = Int(AMDGPU.Device.gridItemDim().x), y = Int(AMDGPU.Device.gridItemDim().y), z = Int(AMDGPU.Device.gridItemDim().z))
end

@device_override @inline function KA.__validindex(ctx)
Expand All @@ -166,8 +193,8 @@ end

# Shared memory.

@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
ptr = AMDGPU.Device.alloc_special(Val(Id), T, Val(AMDGPU.AS.Local), Val(prod(Dims)))
@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
ptr = AMDGPU.Device.alloc_special(Val(:shmem), T, Val(AMDGPU.AS.Local), Val(prod(Dims)))
AMDGPU.ROCDeviceArray(Dims, ptr)
end

Expand All @@ -177,12 +204,13 @@ end

# Other.

@device_override @inline function KA.__synchronize()
@device_override @inline function KI.barrier()
AMDGPU.Device.sync_workgroup()
end

@device_override @inline function KA.__print(args...)
@device_override @inline function KI._print(args...)
# TODO
end
## COV_EXCL_STOP

end
8 changes: 4 additions & 4 deletions src/device/gcn/memory_static.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
@generated function alloc_special(
::Val{id}, ::Type{T}, ::Val{as}, ::Val{len}, ::Val{zeroinit} = Val{false}(),
) where {id,T,as,len,zeroinit}
@dispose ctx=Context() begin
Context() do ctx
eltyp = convert(LLVMType, T)

# old versions of GPUArrays invoke _shmem with an integer id; make sure those are unique
Expand All @@ -24,8 +24,8 @@
gv = GlobalVariable(mod, gv_typ, string(id), as)
if len > 0
if as == AS.Local
linkage!(gv, LLVM.API.LLVMExternalLinkage)
# NOTE: Backend doesn't support initializer for local AS
linkage!(gv, LLVM.API.LLVMInternalLinkage)
initializer!(gv, UndefValue(gv_typ))
elseif as == AS.Private
linkage!(gv, LLVM.API.LLVMInternalLinkage)
initializer!(gv, null(gv_typ))
Expand All @@ -38,7 +38,7 @@
alignment!(gv, Base.max(32, Base.datatype_alignment(T)))

# generate IR
@dispose builder=IRBuilder() begin
IRBuilder() do builder
entry = BasicBlock(llvm_f, "entry")
position!(builder, entry)

Expand Down
2 changes: 1 addition & 1 deletion test/ka_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ include(joinpath(pkgdir(KernelAbstractions), "test", "testsuite.jl"))
AMDGPU.allowscalar(false)

# TODO fix Printing
skip_tests = ["Printing", "sparse"]
skip_tests = ["Printing", "sparse", "CPU synchronization", "fallback test: callable types",]
if Sys.iswindows()
# TODO
# We do not support hostcalls on Windows yet.
Expand Down
9 changes: 8 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
@static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "AMDGPU.jl") == "AMDGPU.jl"
using Pkg
Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")
Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
end

using AMDGPU
using AMDGPU: Device, Runtime, @allowscalar
import AMDGPU.Device: HostCallHolder, hostcall!
Expand Down Expand Up @@ -35,7 +41,8 @@ end

AMDGPU.allowscalar(false)

const TEST_NAMES = ["core", "hip", "ext", "gpuarrays", "kernelabstractions", "enzyme"]
# const TEST_NAMES = ["core", "hip", "ext", "gpuarrays", "kernelabstractions", "enzyme"]
const TEST_NAMES = ["kernelabstractions"]

function parse_flags!(args, flag; default = nothing, typ = typeof(default))
for f in args
Expand Down