diff --git a/Project.toml b/Project.toml index 7107e98d..8a6a2375 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "oneAPI" uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" authors = ["Tim Besard "] -version = "2.4.0" +version = "2.4.1" [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" @@ -37,18 +37,19 @@ Adapt = "4" CEnum = "0.4, 0.5" ExprTools = "0.1" GPUArrays = "11.2.1" -GPUCompiler = "1.5" +GPUCompiler = "1.6" GPUToolbox = "0.1, 0.2, 0.3, 1" KernelAbstractions = "0.9.1" LLVM = "6, 7, 8, 9" -NEO_jll = "=25.31.34666" +NEO_jll = "=25.35.35096" Preferences = "1" -SPIRVIntrinsics = "0.2" -SPIRV_LLVM_Translator_jll = "20" +SPIRVIntrinsics = "0.5" +SPIRV_LLVM_Translator_jll = "21" +SPIRV_Tools_jll = "2025.4.0" SpecialFunctions = "1.3, 2" StaticArrays = "1" julia = "1.10" -oneAPI_Level_Zero_Loader_jll = "1.22" +oneAPI_Level_Zero_Loader_jll = "1.24" oneAPI_Support_jll = "0.9.2" [extras] diff --git a/README.md b/README.md index 8b87ccc6..f4ebdf60 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ translator](https://github.com/KhronosGroup/SPIRV-LLVM-Translator): ```julia julia> function kernel() - barrier() + barrier(0) return end diff --git a/src/array.jl b/src/array.jl index edc6b449..d576cdb7 100644 --- a/src/array.jl +++ b/src/array.jl @@ -279,8 +279,8 @@ end ## interop with GPU arrays -function Base.unsafe_convert(::Type{oneDeviceArray{T,N,AS.Global}}, a::oneArray{T,N}) where {T,N} - oneDeviceArray{T,N,AS.Global}(size(a), reinterpret(LLVMPtr{T,AS.Global}, pointer(a)), +function Base.unsafe_convert(::Type{oneDeviceArray{T,N,AS.CrossWorkgroup}}, a::oneArray{T,N}) where {T,N} + oneDeviceArray{T,N,AS.CrossWorkgroup}(size(a), reinterpret(LLVMPtr{T,AS.CrossWorkgroup}, pointer(a)), a.maxsize - a.offset*Base.elsize(a)) end diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 5fbcb9c9..aeb83f40 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -6,7 +6,8 @@ const oneAPICompilerJob = CompilerJob{SPIRVCompilerTarget,oneAPICompilerParams} GPUCompiler.runtime_module(::oneAPICompilerJob) = oneAPI -GPUCompiler.method_table(::oneAPICompilerJob) = method_table +GPUCompiler.method_table_view(job::oneAPICompilerJob) = + GPUCompiler.StackedMethodTable(job.world, method_table, SPIRVIntrinsics.method_table) # filter out OpenCL built-ins # TODO: eagerly lower these using the translator API @@ -14,7 +15,8 @@ GPUCompiler.isintrinsic(job::oneAPICompilerJob, fn::String) = invoke(GPUCompiler.isintrinsic, Tuple{CompilerJob{SPIRVCompilerTarget}, typeof(fn)}, job, fn) || - in(fn, opencl_builtins) + in(fn, known_intrinsics) || + contains(fn, "__spirv_") function GPUCompiler.finish_module!(job::oneAPICompilerJob, mod::LLVM.Module, entry::LLVM.Function) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 7101eaae..6503a9b0 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -88,7 +88,7 @@ Adapt.adapt_storage(to::KernelAdaptor, p::ZePtr{T}) where {T} = reinterpret(Ptr{ # convert oneAPI host arrays to device arrays Adapt.adapt_storage(::KernelAdaptor, xs::oneArray{T,N}) where {T,N} = - Base.unsafe_convert(oneDeviceArray{T,N,AS.Global}, xs) + Base.unsafe_convert(oneDeviceArray{T,N,AS.CrossWorkgroup}, xs) # Base.RefValue isn't GPU compatible, so provide a compatible alternative. # TODO: port improvements from CUDA.jl diff --git a/src/device/quirks.jl b/src/device/quirks.jl index ab532f40..987922a5 100644 --- a/src/device/quirks.jl +++ b/src/device/quirks.jl @@ -60,10 +60,13 @@ end # From Metal.jl to avoid widemul and Int128 @static if VERSION >= v"1.12.0-DEV.1736" # Partially reverts JuliaLang/julia PR #56750 - let BitInteger64 = Union{Int64, UInt64} - @device_override function Base.checkbounds(::Type{Bool}, v::StepRange{<:BitInteger64, <:BitInteger64}, i::BitInteger64) - @inline - return checkindex(Bool, eachindex(IndexLinear(), v), i) - end + const BitInteger64 = Union{Int64, UInt64} + @device_override function Base.checkbounds(::Type{Bool}, v::StepRange{<:BitInteger64, <:BitInteger64}, i::BitInteger64) + @inline + return checkindex(Bool, eachindex(IndexLinear(), v), i) end + + # Less accurate division for Float32 than Base Julia which relies on Float64 + # https://github.com/JuliaLang/julia/pull/49637 + @device_override Base.div(x::Float32, y::Float32) = trunc(x / y) end diff --git a/src/mapreduce.jl b/src/mapreduce.jl index 7f3d2e55..fd1a2c71 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -16,7 +16,7 @@ # perform a reduction d = 1 while d < items - barrier() + barrier(0) index = 2 * d * (item-1) + 1 @inbounds if index <= items other_val = if index + d <= items diff --git a/src/oneAPI.jl b/src/oneAPI.jl index b7f8b527..9e39fa9f 100644 --- a/src/oneAPI.jl +++ b/src/oneAPI.jl @@ -30,6 +30,7 @@ functional() = oneL0.functional[] import SPIRVIntrinsics SPIRVIntrinsics.@import_all SPIRVIntrinsics.@reexport_public +Base.Experimental.@MethodTable(method_table) include("device/runtime.jl") include("device/array.jl") include("device/quirks.jl") diff --git a/src/oneAPIKernels.jl b/src/oneAPIKernels.jl index 66729b57..2fd144ad 100644 --- a/src/oneAPIKernels.jl +++ b/src/oneAPIKernels.jl @@ -1,7 +1,7 @@ module oneAPIKernels using ..oneAPI -using ..oneAPI: @device_override +using ..oneAPI: @device_override, SPIRVIntrinsics, method_table import KernelAbstractions as KA @@ -161,7 +161,7 @@ end ## Synchronization and Printing @device_override @inline function KA.__synchronize() - barrier() + barrier(0) end @device_override @inline function KA.__print(args...) diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl index 713d55ef..5e5605ef 100644 --- a/test/device/intrinsics.jl +++ b/test/device/intrinsics.jl @@ -226,7 +226,7 @@ end s[t] = d[t] s2[t] = 2*d[t] - barrier() + barrier(0) d[t] = s[tr] return @@ -252,7 +252,7 @@ end s[t] = d[t] s2[t] = d[t] - barrier() + barrier(0) d[t] = s[tr] return diff --git a/test/execution.jl b/test/execution.jl index 596d0d8f..cd3db014 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -569,18 +569,18 @@ end r[tx] = r_[tx] - barrier() + barrier(0) for j=1:n if tx == 1 r[j] = r[j] / 2f0 end - barrier() + barrier(0) if tx > j && tx <= 4 r[tx] = r[tx] - 2f0*r[j] end - barrier() + barrier(0) end if bx == 1 @@ -606,7 +606,7 @@ end # conversions from integers to pointers resulted in lost memory stores function kernel(ptr) - ptr = reinterpret(Core.LLVMPtr{Float32, AS.Global}, ptr) + ptr = reinterpret(Core.LLVMPtr{Float32, AS.CrossWorkgroup}, ptr) unsafe_store!(ptr, 42) return end