From 02f16948c6847d5d968cc1ec80c998d42eb7255a Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Thu, 4 Dec 2025 13:38:29 -0600
Subject: [PATCH 1/8] Update KA API

---
 src/oneAPIKernels.jl | 100 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 94 insertions(+), 6 deletions(-)

diff --git a/src/oneAPIKernels.jl b/src/oneAPIKernels.jl
index 2fd144ad..61f2b2d7 100644
--- a/src/oneAPIKernels.jl
+++ b/src/oneAPIKernels.jl
@@ -15,18 +15,24 @@ import Adapt
 export oneAPIBackend
 
 struct oneAPIBackend <: KA.GPU
+    prefer_blocks::Bool
+    always_inline::Bool
 end
 
-KA.allocate(::oneAPIBackend, ::Type{T}, dims::Tuple) where T = oneArray{T}(undef, dims)
-KA.zeros(::oneAPIBackend, ::Type{T}, dims::Tuple) where T = oneAPI.zeros(T, dims)
-KA.ones(::oneAPIBackend, ::Type{T}, dims::Tuple) where T = oneAPI.ones(T, dims)
+oneAPIBackend(; prefer_blocks=false, always_inline=false) = oneAPIBackend(prefer_blocks, always_inline)
+
+@inline KA.allocate(::oneAPIBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where T = oneArray{T, length(dims), unified ? oneAPI.oneL0.SharedBuffer : oneAPI.oneL0.DeviceBuffer}(undef, dims)
+@inline KA.zeros(::oneAPIBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where T = fill!(oneArray{T, length(dims), unified ? oneAPI.oneL0.SharedBuffer : oneAPI.oneL0.DeviceBuffer}(undef, dims), zero(T))
+@inline KA.ones(::oneAPIBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where T = fill!(oneArray{T, length(dims), unified ? oneAPI.oneL0.SharedBuffer : oneAPI.oneL0.DeviceBuffer}(undef, dims), one(T))
 
 KA.get_backend(::oneArray) = oneAPIBackend()
 # TODO should be non-blocking
-KA.synchronize(::oneAPIBackend) = oneL0.synchronize()
+KA.synchronize(::oneAPIBackend) = oneAPI.oneL0.synchronize()
 KA.supports_float64(::oneAPIBackend) = false  # TODO: Check if this is device dependent
 
-Adapt.adapt_storage(::oneAPIBackend, a::Array) = Adapt.adapt(oneArray, a)
+KA.functional(::oneAPIBackend) = oneAPI.functional()
+
+Adapt.adapt_storage(::oneAPIBackend, a::AbstractArray) = Adapt.adapt(oneArray, a)
 Adapt.adapt_storage(::oneAPIBackend, a::oneArray) = a
 Adapt.adapt_storage(::KA.CPU, a::oneArray) = convert(Array, a)
 
@@ -39,6 +45,24 @@ function KA.copyto!(::oneAPIBackend, A, B)
 end
 
 
+## Device Operations
+
+function KA.ndevices(::oneAPIBackend)
+    return length(oneAPI.devices())
+end
+
+function KA.device(::oneAPIBackend)::Int
+    dev = oneAPI.device()
+    devs = oneAPI.devices()
+    idx = findfirst(==(dev), devs)
+    return idx === nothing ? 1 : idx
+end
+
+function KA.device!(backend::oneAPIBackend, id::Int)
+    oneAPI.device!(id)
+end
+
+
 ## Kernel Launch
 
 function KA.mkcontext(kernel::KA.Kernel{oneAPIBackend}, _ndrange, iterspace)
@@ -83,14 +107,42 @@ function threads_to_workgroupsize(threads, ndrange)
 end
 
 function (obj::KA.Kernel{oneAPIBackend})(args...; ndrange=nothing, workgroupsize=nothing)
+    backend = KA.backend(obj)
+
     ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, workgroupsize)
     # this might not be the final context, since we may tune the workgroupsize
     ctx = KA.mkcontext(obj, ndrange, iterspace)
-    kernel = @oneapi launch=false obj.f(ctx, args...)
+
+    # If the kernel is statically sized we can tell the compiler about that
+    if KA.workgroupsize(obj) <: KA.StaticSize
+        # TODO: maxthreads
+        # maxthreads = prod(KA.get(KA.workgroupsize(obj)))
+    else
+        # maxthreads = nothing
+    end
+
+    kernel = @oneapi launch=false always_inline=backend.always_inline obj.f(ctx, args...)
 
     # figure out the optimal workgroupsize automatically
     if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
         items = oneAPI.launch_configuration(kernel)
+
+        if backend.prefer_blocks
+            # Prefer blocks over threads:
+            # Reducing the workgroup size (items) increases the number of workgroups (blocks).
+            # We use a simple heuristic here since we lack full occupancy info (max_blocks) from launch_configuration.
+
+            # If the total range is large enough, full workgroups are fine.
+            # If the range is small, we might want to reduce 'items' to create more blocks to fill the GPU.
+            # (Simplified logic compared to CUDA.jl which uses explicit occupancy calculators)
+            total_items = prod(ndrange)
+            if total_items < items * 16 # Heuristic factor
+                 # Force at least a few blocks if possible by reducing items per block
+                 target_blocks = 16 # Target at least 16 blocks
+                 items = max(1, min(items, cld(total_items, target_blocks)))
+            end
+        end
+
         workgroupsize = threads_to_workgroupsize(items, ndrange)
         iterspace, dynamic = KA.partition(obj, ndrange, workgroupsize)
         ctx = KA.mkcontext(obj, ndrange, iterspace)
@@ -171,6 +223,42 @@ end
 
 ## Other
 
+Adapt.adapt_storage(to::KA.ConstAdaptor, a::oneDeviceArray) = Base.Experimental.Const(a)
+
 KA.argconvert(::KA.Kernel{oneAPIBackend}, arg) = kernel_convert(arg)
 
+function KA.priority!(::oneAPIBackend, prio::Symbol)
+    if !(prio in (:high, :normal, :low))
+        error("priority must be one of :high, :normal, :low")
+    end
+
+    priority_enum = if prio == :high
+        oneAPI.oneL0.ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH
+    elseif prio == :low
+        oneAPI.oneL0.ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW
+    else
+        oneAPI.oneL0.ZE_COMMAND_QUEUE_PRIORITY_NORMAL
+    end
+
+    ctx = oneAPI.context()
+    dev = oneAPI.device()
+
+    # Update the cached queue
+    # We synchronize the current queue first to ensure safety
+    current_queue = oneAPI.global_queue(ctx, dev)
+    oneAPI.oneL0.synchronize(current_queue)
+
+    # Replace the queue in task_local_storage
+    # The key used by global_queue is (:ZeCommandQueue, ctx, dev)
+
+    new_queue = oneAPI.oneL0.ZeCommandQueue(ctx, dev;
+        flags = oneAPI.oneL0.ZE_COMMAND_QUEUE_FLAG_IN_ORDER,
+        priority = priority_enum
+    )
+
+    task_local_storage((:ZeCommandQueue, ctx, dev), new_queue)
+
+    return nothing
+end
+
 end

From 324114a48581d8c0da72c5335f6114f622493ef8 Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Fri, 12 Dec 2025 10:15:35 -0600
Subject: [PATCH 2/8] Support keys() and unsafe_cached_load()

---
 lib/level-zero/device.jl | 2 ++
 src/context.jl           | 8 +++++++-
 src/device/array.jl      | 7 +++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/lib/level-zero/device.jl b/lib/level-zero/device.jl
index 07a72e17..58d5b2da 100644
--- a/lib/level-zero/device.jl
+++ b/lib/level-zero/device.jl
@@ -204,6 +204,8 @@ Base.length(iter::ZeDevices) = length(iter.handles)
 
 Base.IteratorSize(::ZeDevices) = Base.HasLength()
 
+Base.keys(iter::ZeDevices) = 1:length(iter)
+
 function Base.show(io::IO, ::MIME"text/plain", iter::ZeDevices)
     print(io, "ZeDevice iterator for $(length(iter)) devices")
     if !isempty(iter)
diff --git a/src/context.jl b/src/context.jl
index b0f9ff10..89a2f219 100644
--- a/src/context.jl
+++ b/src/context.jl
@@ -103,7 +103,13 @@ See also: [`device`](@ref), [`devices`](@ref)
 function device!(drv::ZeDevice)
     task_local_storage(:ZeDevice, drv)
 end
-device!(i::Int) = device!(devices(driver())[i])
+function device!(i::Int)
+    devs = devices(driver())
+    if i < 1 || i > length(devs)
+        throw(ArgumentError("Invalid device index $i (must be between 1 and $(length(devs)))"))
+    end
+    device!(devs[i])
+end
 
 const global_contexts = Dict{ZeDriver,ZeContext}()
 
diff --git a/src/device/array.jl b/src/device/array.jl
index ae339110..95fc911f 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -195,6 +195,13 @@ end
     end
 end
 
+@device_function @inline function unsafe_cached_load(ptr::LLVMPtr{T,A}, i::Integer, align::Val) where {T,A}
+    # For SPIR-V/Level Zero, we don't have explicit cache control intrinsics like CUDA's __ldg
+    # So we fall back to a regular unsafe_load. The SPIR-V compiler may still apply
+    # appropriate optimizations based on context.
+    unsafe_load(ptr, i, align)
+end
+
 @device_function @inline function const_arrayref(A::oneDeviceArray{T}, index::Integer) where {T}
     # simplified bounds check (see `arrayset`)
     #@boundscheck checkbounds(A, index)

From e723fdf0bae7b8e668a18e51fc61816bdbfbbf69 Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Fri, 12 Dec 2025 12:15:30 -0600
Subject: [PATCH 3/8] Runic, KA 0.9.39, and supports_unified()

---
 Project.toml         |  2 +-
 src/context.jl       |  2 +-
 src/device/array.jl  |  2 +-
 src/oneAPIKernels.jl | 22 ++++++++++++----------
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/Project.toml b/Project.toml
index d9925aae..abd8d7e0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -44,7 +44,7 @@ ExprTools = "0.1"
 GPUArrays = "11.2.1"
 GPUCompiler = "1.6"
 GPUToolbox = "0.1, 0.2, 0.3, 1"
-KernelAbstractions = "0.9.1"
+KernelAbstractions = "0.9.39"
 LLVM = "6, 7, 8, 9"
 NEO_jll = "=25.44.36015"
 Preferences = "1"
diff --git a/src/context.jl b/src/context.jl
index 89a2f219..710bf720 100644
--- a/src/context.jl
+++ b/src/context.jl
@@ -108,7 +108,7 @@ function device!(i::Int)
     if i < 1 || i > length(devs)
         throw(ArgumentError("Invalid device index $i (must be between 1 and $(length(devs)))"))
     end
-    device!(devs[i])
+    return device!(devs[i])
 end
 
 const global_contexts = Dict{ZeDriver,ZeContext}()
diff --git a/src/device/array.jl b/src/device/array.jl
index 95fc911f..24822656 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -195,7 +195,7 @@ end
     end
 end
 
-@device_function @inline function unsafe_cached_load(ptr::LLVMPtr{T,A}, i::Integer, align::Val) where {T,A}
+@device_function @inline function unsafe_cached_load(ptr::LLVMPtr{T, A}, i::Integer, align::Val) where {T, A}
     # For SPIR-V/Level Zero, we don't have explicit cache control intrinsics like CUDA's __ldg
     # So we fall back to a regular unsafe_load. The SPIR-V compiler may still apply
     # appropriate optimizations based on context.
diff --git a/src/oneAPIKernels.jl b/src/oneAPIKernels.jl
index 61f2b2d7..6e092397 100644
--- a/src/oneAPIKernels.jl
+++ b/src/oneAPIKernels.jl
@@ -19,16 +19,17 @@ struct oneAPIBackend <: KA.GPU
     always_inline::Bool
 end
 
-oneAPIBackend(; prefer_blocks=false, always_inline=false) = oneAPIBackend(prefer_blocks, always_inline)
+oneAPIBackend(; prefer_blocks = false, always_inline = false) = oneAPIBackend(prefer_blocks, always_inline)
 
-@inline KA.allocate(::oneAPIBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where T = oneArray{T, length(dims), unified ? oneAPI.oneL0.SharedBuffer : oneAPI.oneL0.DeviceBuffer}(undef, dims)
-@inline KA.zeros(::oneAPIBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where T = fill!(oneArray{T, length(dims), unified ? oneAPI.oneL0.SharedBuffer : oneAPI.oneL0.DeviceBuffer}(undef, dims), zero(T))
-@inline KA.ones(::oneAPIBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where T = fill!(oneArray{T, length(dims), unified ? oneAPI.oneL0.SharedBuffer : oneAPI.oneL0.DeviceBuffer}(undef, dims), one(T))
+@inline KA.allocate(::oneAPIBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where {T} = oneArray{T, length(dims), unified ? oneAPI.oneL0.SharedBuffer : oneAPI.oneL0.DeviceBuffer}(undef, dims)
+@inline KA.zeros(::oneAPIBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where {T} = fill!(oneArray{T, length(dims), unified ? oneAPI.oneL0.SharedBuffer : oneAPI.oneL0.DeviceBuffer}(undef, dims), zero(T))
+@inline KA.ones(::oneAPIBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where {T} = fill!(oneArray{T, length(dims), unified ? oneAPI.oneL0.SharedBuffer : oneAPI.oneL0.DeviceBuffer}(undef, dims), one(T))
 
 KA.get_backend(::oneArray) = oneAPIBackend()
 # TODO should be non-blocking
 KA.synchronize(::oneAPIBackend) = oneAPI.oneL0.synchronize()
 KA.supports_float64(::oneAPIBackend) = false  # TODO: Check if this is device dependent
+KA.supports_unified(::oneAPIBackend) = true
 
 KA.functional(::oneAPIBackend) = oneAPI.functional()
 
@@ -59,7 +60,7 @@ function KA.device(::oneAPIBackend)::Int
 end
 
 function KA.device!(backend::oneAPIBackend, id::Int)
-    oneAPI.device!(id)
+    return oneAPI.device!(id)
 end
 
 
@@ -121,7 +122,7 @@ function (obj::KA.Kernel{oneAPIBackend})(args...; ndrange=nothing, workgroupsize
         # maxthreads = nothing
     end
 
-    kernel = @oneapi launch=false always_inline=backend.always_inline obj.f(ctx, args...)
+    kernel = @oneapi launch = false always_inline = backend.always_inline obj.f(ctx, args...)
 
     # figure out the optimal workgroupsize automatically
     if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
@@ -137,9 +138,9 @@ function (obj::KA.Kernel{oneAPIBackend})(args...; ndrange=nothing, workgroupsize
             # (Simplified logic compared to CUDA.jl which uses explicit occupancy calculators)
             total_items = prod(ndrange)
             if total_items < items * 16 # Heuristic factor
-                 # Force at least a few blocks if possible by reducing items per block
-                 target_blocks = 16 # Target at least 16 blocks
-                 items = max(1, min(items, cld(total_items, target_blocks)))
+                # Force at least a few blocks if possible by reducing items per block
+                target_blocks = 16 # Target at least 16 blocks
+                items = max(1, min(items, cld(total_items, target_blocks)))
             end
         end
 
@@ -251,7 +252,8 @@ function KA.priority!(::oneAPIBackend, prio::Symbol)
     # Replace the queue in task_local_storage
     # The key used by global_queue is (:ZeCommandQueue, ctx, dev)
 
-    new_queue = oneAPI.oneL0.ZeCommandQueue(ctx, dev;
+    new_queue = oneAPI.oneL0.ZeCommandQueue(
+        ctx, dev;
         flags = oneAPI.oneL0.ZE_COMMAND_QUEUE_FLAG_IN_ORDER,
         priority = priority_enum
     )

From 6bf7cde6c404898c385293542e3ea7fb27ef2bd7 Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Fri, 12 Dec 2025 14:01:57 -0600
Subject: [PATCH 4/8] Circular dependency in 1.10

---
 test/Project.toml | 1 -
 test/setup.jl     | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/Project.toml b/test/Project.toml
index cb603629..c214ed96 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,6 +1,5 @@
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
diff --git a/test/setup.jl b/test/setup.jl
index a3b0f1a4..269d5b9c 100644
--- a/test/setup.jl
+++ b/test/setup.jl
@@ -1,4 +1,4 @@
-using Distributed, Test, oneAPI, AcceleratedKernels
+using Distributed, Test, oneAPI
 
 oneAPI.functional() || error("oneAPI.jl is not functional on this system")
 

From 16784d7a58cef12d9e772ea58e5b82d70a9b3bae Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Fri, 12 Dec 2025 14:21:21 -0600
Subject: [PATCH 5/8] Make AcceleratedKernels a hard dependency; move ext to src/accumulate.jl

---
 Project.toml                                            | 3 ++-
 ext/oneAPIAcceleratedKernelsExt.jl => src/accumulate.jl | 4 ----
 src/oneAPI.jl                                           | 1 +
 3 files changed, 3 insertions(+), 5 deletions(-)
 rename ext/oneAPIAcceleratedKernelsExt.jl => src/accumulate.jl (92%)

diff --git a/Project.toml b/Project.toml
index abd8d7e0..f43159b8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ authors = ["Tim Besard <tim.besard@gmail.com>", "Alexis Montoison", "Michel Scha
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
@@ -37,7 +38,7 @@ oneAPIAcceleratedKernelsExt = "AcceleratedKernels"
 
 [compat]
 AbstractFFTs = "1.5.0"
-AcceleratedKernels = "0.4.3"
+AcceleratedKernels = "0.3.1, 0.4"
 Adapt = "4"
 CEnum = "0.4, 0.5"
 ExprTools = "0.1"
diff --git a/ext/oneAPIAcceleratedKernelsExt.jl b/src/accumulate.jl
similarity index 92%
rename from ext/oneAPIAcceleratedKernelsExt.jl
rename to src/accumulate.jl
index 6bb96180..f893dc96 100644
--- a/ext/oneAPIAcceleratedKernelsExt.jl
+++ b/src/accumulate.jl
@@ -1,5 +1,3 @@
-module oneAPIAcceleratedKernelsExt
-
 import oneAPI
 import oneAPI: oneArray, oneAPIBackend
 import AcceleratedKernels as AK
@@ -13,5 +11,3 @@ Base.accumulate(op, A::oneArray; init = zero(eltype(A)), kwargs...) =
 
 Base.cumsum(src::oneArray; kwargs...) = AK.cumsum(src, oneAPIBackend(); kwargs...)
 Base.cumprod(src::oneArray; kwargs...) = AK.cumprod(src, oneAPIBackend(); kwargs...)
-
-end # module
diff --git a/src/oneAPI.jl b/src/oneAPI.jl
index b9caa398..e10dd0d3 100644
--- a/src/oneAPI.jl
+++ b/src/oneAPI.jl
@@ -69,6 +69,7 @@ include("utils.jl")
 
 include("oneAPIKernels.jl")
 import .oneAPIKernels: oneAPIBackend
+include("accumulate.jl")
 include("indexing.jl")
 export oneAPIBackend
 

From eff363ded81ddba94aa248e9239fdb08894302ac Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Fri, 12 Dec 2025 14:31:25 -0600
Subject: [PATCH 6/8] Remove AcceleratedKernels weakdeps/extensions entries from Project.toml

---
 Project.toml | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/Project.toml b/Project.toml
index f43159b8..bd0179c2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "oneAPI"
 uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
-version = "2.6.0"
 authors = ["Tim Besard <tim.besard@gmail.com>", "Alexis Montoison", "Michel Schanen <michel.schanen@gmail.com>"]
+version = "2.6.0"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
@@ -30,12 +30,6 @@ oneAPI_Level_Zero_Headers_jll = "f4bc562b-d309-54f8-9efb-476e56f0410d"
 oneAPI_Level_Zero_Loader_jll = "13eca655-d68d-5b81-8367-6d99d727ab01"
 oneAPI_Support_jll = "b049733a-a71d-5ed3-8eba-7d323ac00b36"
 
-[weakdeps]
-AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
-
-[extensions]
-oneAPIAcceleratedKernelsExt = "AcceleratedKernels"
-
 [compat]
 AbstractFFTs = "1.5.0"
 AcceleratedKernels = "0.3.1, 0.4"

From a1436e14bb6f6ce81e4d3a21a1e2542b3f2ad587 Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Mon, 15 Dec 2025 10:26:03 -0600
Subject: [PATCH 7/8] scalar indexing for unified arrays

---
 src/array.jl | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/array.jl b/src/array.jl
index 4d621cf2..7a1e14cb 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -345,6 +345,19 @@ function Base.unsafe_convert(::Type{ZePtr{T}}, x::oneArray{T}) where {T}
 end
 
 
+## indexing
+
+# Host-accessible arrays can be indexed from CPU, bypassing GPUArrays restrictions
+function Base.getindex(x::oneArray{<:Any, <:Any, <:Union{oneL0.HostBuffer,oneL0.SharedBuffer}}, I::Int)
+  @boundscheck checkbounds(x, I)
+  unsafe_load(pointer(x, I; type=oneL0.HostBuffer))
+end
+
+function Base.setindex!(x::oneArray{<:Any, <:Any, <:Union{oneL0.HostBuffer,oneL0.SharedBuffer}}, v, I::Int)
+  @boundscheck checkbounds(x, I)
+  unsafe_store!(pointer(x, I; type=oneL0.HostBuffer), v)
+end
+
 
 ## interop with GPU arrays
 

From 493e06d749916d2731c55b923abbcfe2c98a3caf Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Mon, 15 Dec 2025 11:49:12 -0600
Subject: [PATCH 8/8] Runic

---
 src/array.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 7a1e14cb..f219ad5f 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -348,14 +348,14 @@ end
 ## indexing
 
 # Host-accessible arrays can be indexed from CPU, bypassing GPUArrays restrictions
-function Base.getindex(x::oneArray{<:Any, <:Any, <:Union{oneL0.HostBuffer,oneL0.SharedBuffer}}, I::Int)
-  @boundscheck checkbounds(x, I)
-  unsafe_load(pointer(x, I; type=oneL0.HostBuffer))
+function Base.getindex(x::oneArray{<:Any, <:Any, <:Union{oneL0.HostBuffer, oneL0.SharedBuffer}}, I::Int)
+    @boundscheck checkbounds(x, I)
+    return unsafe_load(pointer(x, I; type = oneL0.HostBuffer))
 end
 
-function Base.setindex!(x::oneArray{<:Any, <:Any, <:Union{oneL0.HostBuffer,oneL0.SharedBuffer}}, v, I::Int)
-  @boundscheck checkbounds(x, I)
-  unsafe_store!(pointer(x, I; type=oneL0.HostBuffer), v)
+function Base.setindex!(x::oneArray{<:Any, <:Any, <:Union{oneL0.HostBuffer, oneL0.SharedBuffer}}, v, I::Int)
+    @boundscheck checkbounds(x, I)
+    return unsafe_store!(pointer(x, I; type = oneL0.HostBuffer), v)
 end