Auto-detect SPIR-V extensions from device capabilities (#410)
SimonDanisch wants to merge 5 commits into master from
Conversation
When using KernelAbstractions with features like Atomix.@atomic on Float32, the required SPIR-V extensions (e.g., SPV_EXT_shader_atomic_float_add) were not being enabled because KernelAbstractions calls @OpenCL internally without the `extensions` parameter. This change automatically queries the device's OpenCL extensions and maps them to the corresponding SPIR-V extensions that should be enabled during compilation. Currently supported mappings: - cl_ext_float_atomics -> SPV_EXT_shader_atomic_float_add, SPV_EXT_shader_atomic_float_min_max The mapping dictionary can be extended as needed for other extension pairs. Fixes the issue where Float32 atomics fail through KernelAbstractions even when the device supports cl_ext_float_atomics. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.

diff --git a/lib/intrinsics/src/integer.jl b/lib/intrinsics/src/integer.jl
index 1a76c38..9e82919 100644
--- a/lib/intrinsics/src/integer.jl
+++ b/lib/intrinsics/src/integer.jl
@@ -36,7 +36,7 @@ for gentype in generic_integer_types
@device_function popcount(x::$gentype) = @builtin_ccall("popcount", $gentype, ($gentype,), x)
-@device_override Base.bitreverse(x::$gentype) = @builtin_ccall("bit_reverse", $gentype, ($gentype,), x)
+ @device_override Base.bitreverse(x::$gentype) = @builtin_ccall("bit_reverse", $gentype, ($gentype,), x)
@device_function mad24(x::$gentype, y::$gentype, z::$gentype) = @builtin_ccall("mad24", $gentype, ($gentype, $gentype, $gentype), x, y, z)
@device_function mul24(x::$gentype, y::$gentype) = @builtin_ccall("mul24", $gentype, ($gentype, $gentype), x, y)
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index d084f3e..b02fb21 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -159,8 +159,10 @@ function compiler_config(dev::cl.Device; kwargs...)
end
return config
end
-@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
- extensions::Vector{String}=String[], kwargs...)
+@noinline function _compiler_config(
+ dev; kernel = true, name = nothing, always_inline = false,
+ extensions::Vector{String} = String[], kwargs...
+ )
supports_fp16 = "cl_khr_fp16" in dev.extensions
supports_fp64 = "cl_khr_fp64" in dev.extensions
@@ -170,8 +172,10 @@ end
all_extensions = unique!(vcat(extensions, auto_extensions))
# create GPUCompiler objects
- target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, validate=true,
- extensions=all_extensions, kwargs...)
+ target = SPIRVCompilerTarget(;
+ supports_fp16, supports_fp64, validate = true,
+ extensions = all_extensions, kwargs...
+ )
params = OpenCLCompilerParams()
CompilerConfig(target, params; kernel, name, always_inline)
end
diff --git a/test/spirv_extensions.jl b/test/spirv_extensions.jl
index 2645153..6f3db93 100644
--- a/test/spirv_extensions.jl
+++ b/test/spirv_extensions.jl
@@ -3,63 +3,63 @@ using Atomix: Atomix
@testset "spirv_extensions" begin
-@testset "bitreverse KernelAbstractions kernel" begin
- @kernel function bitreverse_ka!(out, inp)
- i = @index(Global)
- @inbounds out[i] = bitreverse(inp[i])
- end
+ @testset "bitreverse KernelAbstractions kernel" begin
+ @kernel function bitreverse_ka!(out, inp)
+ i = @index(Global)
+ @inbounds out[i] = bitreverse(inp[i])
+ end
- @testset "$T" for T in [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64]
- N = 64
- inp = CLArray(rand(T, N))
- out = similar(inp)
- bitreverse_ka!(OpenCLBackend())(out, inp; ndrange=N)
- synchronize(OpenCLBackend())
+ @testset "$T" for T in [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64]
+ N = 64
+ inp = CLArray(rand(T, N))
+ out = similar(inp)
+ bitreverse_ka!(OpenCLBackend())(out, inp; ndrange = N)
+ synchronize(OpenCLBackend())
- @test Array(out) == bitreverse.(Array(inp))
+ @test Array(out) == bitreverse.(Array(inp))
+ end
end
-end
-@testset "atomic float accumulation without manual extensions" begin
- # The auto-spirv-extensions feature detects cl_ext_float_atomics and enables
- # SPV_EXT_shader_atomic_float_add automatically. Previously this required
- # passing extensions=["SPV_EXT_shader_atomic_float_add"] to @opencl manually.
- #
- # We test with a concurrent accumulation pattern where multiple work-items
- # write to the same output locations. Without atomics this would race;
- # with Atomix.@atomic (which emits OpAtomicFAddEXT) the result must be exact.
- if "cl_ext_float_atomics" in cl.device().extensions
- @kernel function atomic_accum_kernel!(out, arr)
- i, j = @index(Global, NTuple)
- for k in 1:size(out, 1)
- Atomix.@atomic out[k, i] += arr[i, j]
+ @testset "atomic float accumulation without manual extensions" begin
+ # The auto-spirv-extensions feature detects cl_ext_float_atomics and enables
+ # SPV_EXT_shader_atomic_float_add automatically. Previously this required
+ # passing extensions=["SPV_EXT_shader_atomic_float_add"] to @opencl manually.
+ #
+ # We test with a concurrent accumulation pattern where multiple work-items
+ # write to the same output locations. Without atomics this would race;
+ # with Atomix.@atomic (which emits OpAtomicFAddEXT) the result must be exact.
+ if "cl_ext_float_atomics" in cl.device().extensions
+ @kernel function atomic_accum_kernel!(out, arr)
+ i, j = @index(Global, NTuple)
+ for k in 1:size(out, 1)
+ Atomix.@atomic out[k, i] += arr[i, j]
+ end
end
- end
- @testset "$T" for T in [Float32, Float64]
- if T == Float64 && !("cl_khr_fp64" in cl.device().extensions)
- continue
- end
+ @testset "$T" for T in [Float32, Float64]
+ if T == Float64 && !("cl_khr_fp64" in cl.device().extensions)
+ continue
+ end
- M, N = 32, 64
- img = zeros(T, M, N)
- img[5:15, 5:15] .= one(T)
- img[20:30, 20:30] .= T(2)
+ M, N = 32, 64
+ img = zeros(T, M, N)
+ img[5:15, 5:15] .= one(T)
+ img[20:30, 20:30] .= T(2)
- cl_img = CLArray(img)
- out = KernelAbstractions.zeros(OpenCLBackend(), T, M, N)
- atomic_accum_kernel!(OpenCLBackend())(out, cl_img; ndrange=(M, N))
- synchronize(OpenCLBackend())
+ cl_img = CLArray(img)
+ out = KernelAbstractions.zeros(OpenCLBackend(), T, M, N)
+ atomic_accum_kernel!(OpenCLBackend())(out, cl_img; ndrange = (M, N))
+ synchronize(OpenCLBackend())
- # Each out[k, i] = sum(img[i, :]) — accumulate row i across all columns
- out_host = Array(out)
- expected = zeros(T, M, N)
- for i in 1:M
- expected[:, i] .= sum(img[i, :])
+ # Each out[k, i] = sum(img[i, :]) — accumulate row i across all columns
+ out_host = Array(out)
+ expected = zeros(T, M, N)
+ for i in 1:M
+ expected[:, i] .= sum(img[i, :])
+ end
+ @test out_host ≈ expected
end
- @test out_host ≈ expected
end
end
-end
end |
Codecov Report — ❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## master #410 +/- ##
===========================================
- Coverage 80.84% 31.17% -49.67%
===========================================
Files 12 11 -1
Lines 736 696 -40
===========================================
- Hits 595 217 -378
- Misses 141 479 +338

☔ View full report in Codecov by Sentry.
🚀 New features to boost your workflow:
|
|
Sorry for taking so long to review! I think this is a good approach for now. I was initially thinking it might make more sense to detect this from the generated code rather than unconditionally enabling it on all supported devices, but that seems a lot more complex and I am not sure it gains us much. Could you add some tests? I was also thinking it might make sense to separate the bitreverse changes into a separate PR, but I will leave that up to you. Thanks for the contribution! |
This is needed for supporting Atomix in KernelAbstractions, since we can't set the extension from a KA kernel launch.
This seems to be the most minimal fix, not sure if there are any problems with this or if it would need to be more complete.