diff --git a/Project.toml b/Project.toml
index 7107e98d..8a6a2375 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "oneAPI"
 uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 authors = ["Tim Besard <tim.besard@gmail.com>"]
-version = "2.4.0"
+version = "2.4.1"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
@@ -37,18 +37,19 @@ Adapt = "4"
 CEnum = "0.4, 0.5"
 ExprTools = "0.1"
 GPUArrays = "11.2.1"
-GPUCompiler = "1.5"
+GPUCompiler = "1.6"
 GPUToolbox = "0.1, 0.2, 0.3, 1"
 KernelAbstractions = "0.9.1"
 LLVM = "6, 7, 8, 9"
-NEO_jll = "=25.31.34666"
+NEO_jll = "=25.35.35096"
 Preferences = "1"
-SPIRVIntrinsics = "0.2"
-SPIRV_LLVM_Translator_jll = "20"
+SPIRVIntrinsics = "0.5"
+SPIRV_LLVM_Translator_jll = "21"
+SPIRV_Tools_jll = "2025.4.0"
 SpecialFunctions = "1.3, 2"
 StaticArrays = "1"
 julia = "1.10"
-oneAPI_Level_Zero_Loader_jll = "1.22"
+oneAPI_Level_Zero_Loader_jll = "1.24"
 oneAPI_Support_jll = "0.9.2"
 
 [extras]
diff --git a/README.md b/README.md
index 8b87ccc6..f4ebdf60 100644
--- a/README.md
+++ b/README.md
@@ -147,7 +147,7 @@ translator](https://github.com/KhronosGroup/SPIRV-LLVM-Translator):
 
 ```julia
 julia> function kernel()
-         barrier()
+         barrier(0)
          return
        end
 
diff --git a/src/array.jl b/src/array.jl
index edc6b449..d576cdb7 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -279,8 +279,8 @@ end
 
 ## interop with GPU arrays
 
-function Base.unsafe_convert(::Type{oneDeviceArray{T,N,AS.Global}}, a::oneArray{T,N}) where {T,N}
-  oneDeviceArray{T,N,AS.Global}(size(a), reinterpret(LLVMPtr{T,AS.Global}, pointer(a)),
+function Base.unsafe_convert(::Type{oneDeviceArray{T,N,AS.CrossWorkgroup}}, a::oneArray{T,N}) where {T,N}
+  oneDeviceArray{T,N,AS.CrossWorkgroup}(size(a), reinterpret(LLVMPtr{T,AS.CrossWorkgroup}, pointer(a)),
                                 a.maxsize - a.offset*Base.elsize(a))
 end
 
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 5fbcb9c9..aeb83f40 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -6,7 +6,8 @@ const oneAPICompilerJob = CompilerJob{SPIRVCompilerTarget,oneAPICompilerParams}
 
 GPUCompiler.runtime_module(::oneAPICompilerJob) = oneAPI
 
-GPUCompiler.method_table(::oneAPICompilerJob) = method_table
+GPUCompiler.method_table_view(job::oneAPICompilerJob) =
+    GPUCompiler.StackedMethodTable(job.world, method_table, SPIRVIntrinsics.method_table)
 
 # filter out OpenCL built-ins
 # TODO: eagerly lower these using the translator API
@@ -14,7 +15,8 @@ GPUCompiler.isintrinsic(job::oneAPICompilerJob, fn::String) =
     invoke(GPUCompiler.isintrinsic,
            Tuple{CompilerJob{SPIRVCompilerTarget}, typeof(fn)},
            job, fn) ||
-    in(fn, opencl_builtins)
+    in(fn, known_intrinsics) ||
+    contains(fn, "__spirv_")
 
 function GPUCompiler.finish_module!(job::oneAPICompilerJob, mod::LLVM.Module,
                                     entry::LLVM.Function)
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index 7101eaae..6503a9b0 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -88,7 +88,7 @@ Adapt.adapt_storage(to::KernelAdaptor, p::ZePtr{T}) where {T} = reinterpret(Ptr{
 
 # convert oneAPI host arrays to device arrays
 Adapt.adapt_storage(::KernelAdaptor, xs::oneArray{T,N}) where {T,N} =
-  Base.unsafe_convert(oneDeviceArray{T,N,AS.Global}, xs)
+  Base.unsafe_convert(oneDeviceArray{T,N,AS.CrossWorkgroup}, xs)
 
 # Base.RefValue isn't GPU compatible, so provide a compatible alternative.
 # TODO: port improvements from CUDA.jl
diff --git a/src/device/quirks.jl b/src/device/quirks.jl
index ab532f40..987922a5 100644
--- a/src/device/quirks.jl
+++ b/src/device/quirks.jl
@@ -60,10 +60,13 @@ end
 
 # From Metal.jl to avoid widemul and Int128
 @static if VERSION >= v"1.12.0-DEV.1736" # Partially reverts JuliaLang/julia PR #56750
-    let BitInteger64 = Union{Int64, UInt64}
-        @device_override function Base.checkbounds(::Type{Bool}, v::StepRange{<:BitInteger64, <:BitInteger64}, i::BitInteger64)
-            @inline
-            return checkindex(Bool, eachindex(IndexLinear(), v), i)
-        end
+    const BitInteger64 = Union{Int64, UInt64}
+    @device_override function Base.checkbounds(::Type{Bool}, v::StepRange{<:BitInteger64, <:BitInteger64}, i::BitInteger64)
+        @inline
+        return checkindex(Bool, eachindex(IndexLinear(), v), i)
     end
+
+    # Less accurate Float32 `div` than Base Julia's, whose implementation relies on Float64
+    # https://github.com/JuliaLang/julia/pull/49637
+    @device_override Base.div(x::Float32, y::Float32) = trunc(x / y)
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7f3d2e55..fd1a2c71 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -16,7 +16,7 @@
     # perform a reduction
     d = 1
     while d < items
-        barrier()
+        barrier(0)
         index = 2 * d * (item-1) + 1
         @inbounds if index <= items
             other_val = if index + d <= items
diff --git a/src/oneAPI.jl b/src/oneAPI.jl
index b7f8b527..9e39fa9f 100644
--- a/src/oneAPI.jl
+++ b/src/oneAPI.jl
@@ -30,6 +30,7 @@ functional() = oneL0.functional[]
 import SPIRVIntrinsics
 SPIRVIntrinsics.@import_all
 SPIRVIntrinsics.@reexport_public
+Base.Experimental.@MethodTable(method_table)
 include("device/runtime.jl")
 include("device/array.jl")
 include("device/quirks.jl")
diff --git a/src/oneAPIKernels.jl b/src/oneAPIKernels.jl
index 66729b57..2fd144ad 100644
--- a/src/oneAPIKernels.jl
+++ b/src/oneAPIKernels.jl
@@ -1,7 +1,7 @@
 module oneAPIKernels
 
 using ..oneAPI
-using ..oneAPI: @device_override
+using ..oneAPI: @device_override, SPIRVIntrinsics, method_table
 
 import KernelAbstractions as KA
 
@@ -161,7 +161,7 @@ end
 ## Synchronization and Printing
 
 @device_override @inline function KA.__synchronize()
-    barrier()
+    barrier(0)
 end
 
 @device_override @inline function KA.__print(args...)
diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl
index 713d55ef..5e5605ef 100644
--- a/test/device/intrinsics.jl
+++ b/test/device/intrinsics.jl
@@ -226,7 +226,7 @@ end
 
         s[t] = d[t]
         s2[t] = 2*d[t]
-        barrier()
+        barrier(0)
         d[t] = s[tr]
 
         return
@@ -252,7 +252,7 @@ end
 
             s[t] = d[t]
             s2[t] = d[t]
-            barrier()
+            barrier(0)
             d[t] = s[tr]
 
             return
diff --git a/test/execution.jl b/test/execution.jl
index 596d0d8f..cd3db014 100644
--- a/test/execution.jl
+++ b/test/execution.jl
@@ -569,18 +569,18 @@ end
 
         r[tx] = r_[tx]
 
-        barrier()
+        barrier(0)
 
         for j=1:n
             if tx == 1
                 r[j] = r[j] / 2f0
             end
-            barrier()
+            barrier(0)
 
             if tx > j && tx <= 4
                 r[tx] = r[tx] - 2f0*r[j]
             end
-            barrier()
+            barrier(0)
         end
 
         if bx == 1
@@ -606,7 +606,7 @@ end
     # conversions from integers to pointers resulted in lost memory stores
 
     function kernel(ptr)
-        ptr = reinterpret(Core.LLVMPtr{Float32, AS.Global}, ptr)
+        ptr = reinterpret(Core.LLVMPtr{Float32, AS.CrossWorkgroup}, ptr)
         unsafe_store!(ptr, 42)
         return
     end