From a73cf9705d54f9867f769d46977e09ecaf58b3ff Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 20 Jul 2025 17:51:29 -0300
Subject: [PATCH 01/18] Typos

---
 src/KernelAbstractions.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 15757e3a..7a050932 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -564,7 +564,7 @@ end
 Returns whether `@atomic` operations are supported by the backend.
 
 !!! note
-    Backend implementations **must** implement this function,
+    Backend implementations **must** implement this function
     only if they **do not** support atomic operations with Atomix.
 """
 supports_atomics(::Backend) = true
@@ -575,7 +575,7 @@ supports_atomics(::Backend) = true
 Returns whether `Float64` values are supported by the backend.
 
 !!! note
-    Backend implementations **must** implement this function,
+    Backend implementations **must** implement this function
     only if they **do not** support `Float64`.
 """
 supports_float64(::Backend) = true

From 002de2d7934d925a5a9856c66a5897824423ac29 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 20 Jul 2025 18:09:30 -0300
Subject: [PATCH 02/18] POC `supports_unified`

---
 src/KernelAbstractions.jl | 39 +++++++++++++++++++++++++++------------
 test/test.jl              |  9 +++++++++
 2 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 7a050932..70cf4aa5 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -524,40 +524,55 @@ end
 #   adapt_storage(::Backend, a::BackendArray) = a
 
 """
-    allocate(::Backend, Type, dims...)::AbstractArray
+    allocate(::Backend, Type, dims...; unified=false)::AbstractArray
 
-Allocate a storage array appropriate for the computational backend.
+Allocate a storage array appropriate for the computational backend. `unified`
+allocates an array using unified memory if the backend supports it. Use
+[`supports_unified`](@ref) to determine whether it is supported by a backend.
 
 !!! note
     Backend implementations **must** implement `allocate(::NewBackend, T, dims::Tuple)`
 """
-allocate(backend::Backend, T::Type, dims...) = allocate(backend, T, dims)
-allocate(backend::Backend, T::Type, dims::Tuple) = throw(MethodError(allocate, (backend, T, dims)))
+allocate(backend::Backend, T::Type, dims...; unified=false) = allocate(backend, T, dims; unified)
+allocate(backend::Backend, T::Type, dims::Tuple; unified=false) = throw(MethodError(allocate, (backend, T, dims)))
 
 """
-    zeros(::Backend, Type, dims...)::AbstractArray
+    zeros(::Backend, Type, dims...; unified=false)::AbstractArray
 
 Allocate a storage array appropriate for the computational backend filled with zeros.
+`unified` allocates an array using unified memory if the backend supports it.
 """
-zeros(backend::Backend, T::Type, dims...) = zeros(backend, T, dims)
-function zeros(backend::Backend, ::Type{T}, dims::Tuple) where {T}
-    data = allocate(backend, T, dims...)
+zeros(backend::Backend, T::Type, dims...; kwargs...) = zeros(backend, T, dims; kwargs...)
+function zeros(backend::Backend, ::Type{T}, dims::Tuple; unified=false) where {T}
+    data = allocate(backend, T, dims...; unified)
     fill!(data, zero(T))
     return data
 end
 
 """
-    ones(::Backend, Type, dims...)::AbstractArray
+    ones(::Backend, Type, dims...; unified=false)::AbstractArray
 
 Allocate a storage array appropriate for the computational backend filled with ones.
+`unified` allocates an array using unified memory if the backend supports it.
 """
-ones(backend::Backend, T::Type, dims...) = ones(backend, T, dims)
-function ones(backend::Backend, ::Type{T}, dims::Tuple) where {T}
-    data = allocate(backend, T, dims)
+ones(backend::Backend, T::Type, dims...; kwargs...) = ones(backend, T, dims; kwargs...)
+function ones(backend::Backend, ::Type{T}, dims::Tuple; unified=false) where {T}
+    data = allocate(backend, T, dims; unified)
     fill!(data, one(T))
     return data
 end
 
+"""
+    supports_unified(::Backend)::Bool
+
+Returns whether unified memory arrays are supported by the backend.
+
+!!! note
+    Backend implementations **must** implement this function
+    only if they **do not** support unified memory.
+"""
+supports_unified(::Backend) = true
+
 """
     supports_atomics(::Backend)::Bool
 
diff --git a/test/test.jl b/test/test.jl
index 53126e88..be2bb4b3 100644
--- a/test/test.jl
+++ b/test/test.jl
@@ -77,6 +77,15 @@ function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; sk
         backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false}
         @test backend isa backendT
 
+        unified = supports_unified(backend)
+        @test unified isa Bool
+        U = allocate(backend, Float32, 5; unified)
+        if unified
+            @test U[3] isa Float32
+        else
+            @test_throws U[3]
+        end
+
         x = allocate(backend, Float32, 5)
         A = allocate(backend, Float32, 5, 5)
         @test @inferred(KernelAbstractions.get_backend(A)) isa backendT

From 6c03b08c9b91f659e669768a57cc272098acddca Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 20 Jul 2025 18:17:31 -0300
Subject: [PATCH 03/18] Formatting

---
 src/KernelAbstractions.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 70cf4aa5..0ccd564a 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -533,8 +533,8 @@ allocates an array using unified memory if the backend supports it. Use
 !!! note
     Backend implementations **must** implement `allocate(::NewBackend, T, dims::Tuple)`
 """
-allocate(backend::Backend, T::Type, dims...; unified=false) = allocate(backend, T, dims; unified)
-allocate(backend::Backend, T::Type, dims::Tuple; unified=false) = throw(MethodError(allocate, (backend, T, dims)))
+allocate(backend::Backend, T::Type, dims...; unified = false) = allocate(backend, T, dims; unified)
+allocate(backend::Backend, T::Type, dims::Tuple; unified = false) = throw(MethodError(allocate, (backend, T, dims)))
 
 """
     zeros(::Backend, Type, dims...; unified=false)::AbstractArray
@@ -543,7 +543,7 @@ Allocate a storage array appropriate for the computational backend filled with z
 `unified` allocates an array using unified memory if the backend supports it.
 """
 zeros(backend::Backend, T::Type, dims...; kwargs...) = zeros(backend, T, dims; kwargs...)
-function zeros(backend::Backend, ::Type{T}, dims::Tuple; unified=false) where {T}
+function zeros(backend::Backend, ::Type{T}, dims::Tuple; unified = false) where {T}
     data = allocate(backend, T, dims...; unified)
     fill!(data, zero(T))
     return data
@@ -556,7 +556,7 @@ Allocate a storage array appropriate for the computational backend filled with o
 `unified` allocates an array using unified memory if the backend supports it.
 """
 ones(backend::Backend, T::Type, dims...; kwargs...) = ones(backend, T, dims; kwargs...)
-function ones(backend::Backend, ::Type{T}, dims::Tuple; unified=false) where {T}
+function ones(backend::Backend, ::Type{T}, dims::Tuple; unified = false) where {T}
     data = allocate(backend, T, dims; unified)
     fill!(data, one(T))
     return data

From bdb7ad9ec0afc71e7c79533bc3ff24f34441bab0 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 21 Jul 2025 14:58:26 -0300
Subject: [PATCH 04/18] Address feedback

---
 src/KernelAbstractions.jl | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 0ccd564a..dd0918a3 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -533,8 +533,17 @@ allocates an array using unified memory if the backend supports it. Use
 !!! note
     Backend implementations **must** implement `allocate(::NewBackend, T, dims::Tuple)`
 """
-allocate(backend::Backend, T::Type, dims...; unified = false) = allocate(backend, T, dims; unified)
-allocate(backend::Backend, T::Type, dims::Tuple; unified = false) = throw(MethodError(allocate, (backend, T, dims)))
+allocate(backend::Backend, T::Type, dims...; kwargs...) = allocate(backend, T, dims; kwargs...)
+function allocate(backend::Backend, T::Type, dims::Tuple; unified::Union{Nothing, Bool} = nothing)
+    if isnothing(unified)
+        throw(MethodError(allocate, (backend, T, dims)))
+    elseif unified
+        throw(ArgumentError("`$(typeof(backend))` either does not support unified memory or it has not yet defined `allocate(backend::$backend, T::Type, dims::Tuple; unified::Bool)`"))
+    else
+        allocate(backend, T, dims)
+    end
+end
+
 
 """
     zeros(::Backend, Type, dims...; unified=false)::AbstractArray
@@ -543,8 +552,8 @@ Allocate a storage array appropriate for the computational backend filled with z
 `unified` allocates an array using unified memory if the backend supports it.
 """
 zeros(backend::Backend, T::Type, dims...; kwargs...) = zeros(backend, T, dims; kwargs...)
-function zeros(backend::Backend, ::Type{T}, dims::Tuple; unified = false) where {T}
-    data = allocate(backend, T, dims...; unified)
+function zeros(backend::Backend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    data = allocate(backend, T, dims...; kwargs...)
     fill!(data, zero(T))
     return data
 end
@@ -556,8 +565,8 @@ Allocate a storage array appropriate for the computational backend filled with o
 `unified` allocates an array using unified memory if the backend supports it.
 """
 ones(backend::Backend, T::Type, dims...; kwargs...) = ones(backend, T, dims; kwargs...)
-function ones(backend::Backend, ::Type{T}, dims::Tuple; unified = false) where {T}
-    data = allocate(backend, T, dims; unified)
+function ones(backend::Backend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    data = allocate(backend, T, dims; kwargs...)
     fill!(data, one(T))
     return data
 end
@@ -569,9 +578,9 @@ Returns whether unified memory arrays are supported by the backend.
 
 !!! note
     Backend implementations **must** implement this function
-    only if they **do not** support unified memory.
+    only if they **do** support unified memory.
 """
-supports_unified(::Backend) = true
+supports_unified(::Backend) = false
 
 """
     supports_atomics(::Backend)::Bool

From 4c0c6da5c59432601d57d461211c4d8a7b1579ce Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 22 Jul 2025 11:25:00 +0200
Subject: [PATCH 05/18] Add CPU definition

---
 src/cpu.jl | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/cpu.jl b/src/cpu.jl
index e383386f..dcfd4f9f 100644
--- a/src/cpu.jl
+++ b/src/cpu.jl
@@ -1,15 +1,15 @@
 synchronize(::CPU) = nothing
 
-allocate(::CPU, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims)
+allocate(::CPU, ::Type{T}, dims::Tupl; unified::Bool=false) where {T} = Array{T}(undef, dims)
 
-function zeros(backend::CPU, ::Type{T}, dims::Tuple) where {T}
-    arr = allocate(backend, T, dims)
+function zeros(backend::CPU, ::Type{T}, dims::Tuple, kwargs...) where {T}
+    arr = allocate(backend, T, dims; kwargs...)
     kernel = init_kernel(backend)
     kernel(arr, zero, T, ndrange = length(arr))
     return arr
 end
-function ones(backend::CPU, ::Type{T}, dims::Tuple) where {T}
-    arr = allocate(backend, T, dims)
+function ones(backend::CPU, ::Type{T}, dims::Tuple, kwargs...) where {T}
+    arr = allocate(backend, T, dims; kwargs...)
     kernel = init_kernel(backend)
     kernel(arr, one, T; ndrange = length(arr))
     return arr
@@ -33,6 +33,7 @@ end
 
 functional(::CPU) = true
 pagelock!(::CPU, x) = nothing
+supports_unified(::CPU) = true
 
 function (obj::Kernel{CPU})(args...; ndrange = nothing, workgroupsize = nothing)
     ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize)

From c4aeb1048c400fb28fa6ecc82c1f1cbecd9095fd Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 22 Jul 2025 10:16:07 -0300
Subject: [PATCH 06/18] Finish CPU backend

---
 src/cpu.jl          |  4 ++--
 src/pocl/backend.jl | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/cpu.jl b/src/cpu.jl
index dcfd4f9f..41185527 100644
--- a/src/cpu.jl
+++ b/src/cpu.jl
@@ -2,13 +2,13 @@ synchronize(::CPU) = nothing
 
 allocate(::CPU, ::Type{T}, dims::Tupl; unified::Bool=false) where {T} = Array{T}(undef, dims)
 
-function zeros(backend::CPU, ::Type{T}, dims::Tuple, kwargs...) where {T}
+function zeros(backend::CPU, ::Type{T}, dims::Tuple; kwargs...) where {T}
     arr = allocate(backend, T, dims; kwargs...)
     kernel = init_kernel(backend)
     kernel(arr, zero, T, ndrange = length(arr))
     return arr
 end
-function ones(backend::CPU, ::Type{T}, dims::Tuple, kwargs...) where {T}
+function ones(backend::CPU, ::Type{T}, dims::Tuple; kwargs...) where {T}
     arr = allocate(backend, T, dims; kwargs...)
     kernel = init_kernel(backend)
     kernel(arr, one, T; ndrange = length(arr))
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 8e7fcc08..ffa24746 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -21,16 +21,16 @@ end
 
 ## Memory Operations
 
-KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims)
+KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple; unified::Bool=false) where {T} = Array{T}(undef, dims)
 
-function KA.zeros(backend::POCLBackend, ::Type{T}, dims::Tuple) where {T}
-    arr = KA.allocate(backend, T, dims)
+function KA.zeros(backend::POCLBackend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    arr = KA.allocate(backend, T, dims; kwargs...)
     kernel = KA.init_kernel(backend)
     kernel(arr, zero, T, ndrange = length(arr))
     return arr
 end
-function KA.ones(backend::POCLBackend, ::Type{T}, dims::Tuple) where {T}
-    arr = KA.allocate(backend, T, dims)
+function KA.ones(backend::POCLBackend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    arr = KA.allocate(backend, T, dims; kwargs...)
     kernel = KA.init_kernel(backend)
     kernel(arr, one, T; ndrange = length(arr))
     return arr

From 4eec3c6dc83fe086e55ac2d6614cba78311a28d1 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 21 Jul 2025 15:05:20 -0300
Subject: [PATCH 07/18] Format & fix

---
 src/KernelAbstractions.jl | 2 +-
 test/test.jl              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index dd0918a3..620b31bc 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -535,7 +535,7 @@ allocates an array using unified memory if the backend supports it. Use
 """
 allocate(backend::Backend, T::Type, dims...; kwargs...) = allocate(backend, T, dims; kwargs...)
 function allocate(backend::Backend, T::Type, dims::Tuple; unified::Union{Nothing, Bool} = nothing)
-    if isnothing(unified)
+    return if isnothing(unified)
         throw(MethodError(allocate, (backend, T, dims)))
     elseif unified
         throw(ArgumentError("`$(typeof(backend))` either does not support unified memory or it has not yet defined `allocate(backend::$backend, T::Type, dims::Tuple; unified::Bool)`"))
diff --git a/test/test.jl b/test/test.jl
index be2bb4b3..4784d97b 100644
--- a/test/test.jl
+++ b/test/test.jl
@@ -83,7 +83,7 @@ function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; sk
         if unified
             @test U[3] isa Float32
         else
-            @test_throws U[3]
+            @test_throws ErrorException U[3]
         end
 
         x = allocate(backend, Float32, 5)

From b892c4a5794dd10040ba5dee51755a2270d60562 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 21 Jul 2025 15:18:50 -0300
Subject: [PATCH 08/18] Fix test

---
 test/test.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test.jl b/test/test.jl
index 4784d97b..4f7f9c7a 100644
--- a/test/test.jl
+++ b/test/test.jl
@@ -77,7 +77,7 @@ function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; sk
         backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false}
         @test backend isa backendT
 
-        unified = supports_unified(backend)
+        unified = KernelAbstractions.supports_unified(backend)
         @test unified isa Bool
         U = allocate(backend, Float32, 5; unified)
         if unified

From be66b7128d2b3ed6f81a5ef27ebf0d23911933ce Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 21 Jul 2025 18:12:32 -0300
Subject: [PATCH 09/18] Format feedback

---
 src/KernelAbstractions.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 620b31bc..dc373f8c 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -535,12 +535,12 @@ allocates an array using unified memory if the backend supports it. Use
 """
 allocate(backend::Backend, T::Type, dims...; kwargs...) = allocate(backend, T, dims; kwargs...)
 function allocate(backend::Backend, T::Type, dims::Tuple; unified::Union{Nothing, Bool} = nothing)
-    return if isnothing(unified)
+    if isnothing(unified)
         throw(MethodError(allocate, (backend, T, dims)))
     elseif unified
         throw(ArgumentError("`$(typeof(backend))` either does not support unified memory or it has not yet defined `allocate(backend::$backend, T::Type, dims::Tuple; unified::Bool)`"))
     else
-        allocate(backend, T, dims)
+        return allocate(backend, T, dims)
     end
 end
 

From 9566220df713cc70ab493cb76a746fa490d8507e Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 22 Jul 2025 09:43:45 -0300
Subject: [PATCH 10/18] Update docstring and shorten error

---
 src/KernelAbstractions.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index dc373f8c..e9b65d17 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -532,13 +532,14 @@ allocates an array using unified memory if the backend supports it. Use
 
 !!! note
     Backend implementations **must** implement `allocate(::NewBackend, T, dims::Tuple)`
+    and **should** implement `allocate(::NewBackend, T, dims::Tuple; unified::Bool=false)`.
 """
 allocate(backend::Backend, T::Type, dims...; kwargs...) = allocate(backend, T, dims; kwargs...)
 function allocate(backend::Backend, T::Type, dims::Tuple; unified::Union{Nothing, Bool} = nothing)
     if isnothing(unified)
         throw(MethodError(allocate, (backend, T, dims)))
     elseif unified
-        throw(ArgumentError("`$(typeof(backend))` either does not support unified memory or it has not yet defined `allocate(backend::$backend, T::Type, dims::Tuple; unified::Bool)`"))
+        throw(ArgumentError("`$(typeof(backend))` does not support unified memory. If you believe it does, please open a github issue."))
     else
         return allocate(backend, T, dims)
     end

From 9ff6f529cdaddd680fe07de383ea10b0f2074f41 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 22 Jul 2025 10:27:55 -0300
Subject: [PATCH 11/18] Enforcing unified support would be breaking

---
 src/KernelAbstractions.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index e9b65d17..5d43b04e 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -578,8 +578,9 @@ end
 Returns whether unified memory arrays are supported by the backend.
 
 !!! note
-    Backend implementations **must** implement this function
-    only if they **do** support unified memory.
+    Backend implementations **should** implement this function
+    only if they **do** support unified memory. It will be required
+    in KernelAbstractions 0.10.
 """
 supports_unified(::Backend) = false
 

From 2ae1075ced2ce7735bead25e4c9a62d89678ef36 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 22 Jul 2025 10:32:44 -0300
Subject: [PATCH 12/18] Feedback

---
 src/KernelAbstractions.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 5d43b04e..12b9e05b 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -579,8 +579,7 @@ Returns whether unified memory arrays are supported by the backend.
 
 !!! note
     Backend implementations **should** implement this function
-    only if they **do** support unified memory. It will be required
-    in KernelAbstractions 0.10.
+    only if they **do** support unified memory.
 """
 supports_unified(::Backend) = false
 

From 441702171ad1d3bc01f2601549ca937c49e99849 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 22 Jul 2025 10:39:58 -0300
Subject: [PATCH 13/18] Fixes

---
 src/KernelAbstractions.jl | 2 +-
 src/cpu.jl                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 12b9e05b..e9b65d17 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -578,7 +578,7 @@ end
 Returns whether unified memory arrays are supported by the backend.
 
 !!! note
-    Backend implementations **should** implement this function
+    Backend implementations **must** implement this function
     only if they **do** support unified memory.
 """
 supports_unified(::Backend) = false
diff --git a/src/cpu.jl b/src/cpu.jl
index 41185527..2e9d7bd0 100644
--- a/src/cpu.jl
+++ b/src/cpu.jl
@@ -1,6 +1,6 @@
 synchronize(::CPU) = nothing
 
-allocate(::CPU, ::Type{T}, dims::Tupl; unified::Bool=false) where {T} = Array{T}(undef, dims)
+allocate(::CPU, ::Type{T}, dims::Tuple; unified::Bool=false) where {T} = Array{T}(undef, dims)
 
 function zeros(backend::CPU, ::Type{T}, dims::Tuple; kwargs...) where {T}
     arr = allocate(backend, T, dims; kwargs...)

From 5151302de56f399153a1bf78243917661e7b628c Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 22 Jul 2025 10:50:57 -0300
Subject: [PATCH 14/18] Remove unused file

---
 src/cpu.jl | 225 -----------------------------------------------------
 1 file changed, 225 deletions(-)
 delete mode 100644 src/cpu.jl

diff --git a/src/cpu.jl b/src/cpu.jl
deleted file mode 100644
index 2e9d7bd0..00000000
--- a/src/cpu.jl
+++ /dev/null
@@ -1,225 +0,0 @@
-synchronize(::CPU) = nothing
-
-allocate(::CPU, ::Type{T}, dims::Tuple; unified::Bool=false) where {T} = Array{T}(undef, dims)
-
-function zeros(backend::CPU, ::Type{T}, dims::Tuple; kwargs...) where {T}
-    arr = allocate(backend, T, dims; kwargs...)
-    kernel = init_kernel(backend)
-    kernel(arr, zero, T, ndrange = length(arr))
-    return arr
-end
-function ones(backend::CPU, ::Type{T}, dims::Tuple; kwargs...) where {T}
-    arr = allocate(backend, T, dims; kwargs...)
-    kernel = init_kernel(backend)
-    kernel(arr, one, T; ndrange = length(arr))
-    return arr
-end
-
-function copyto!(backend::CPU, A, B)
-    if get_backend(A) == get_backend(B) && get_backend(A) isa CPU
-        if length(A) != length(B)
-            error("Arrays must match in length")
-        end
-        if Base.mightalias(A, B)
-            error("Arrays may not alias")
-        end
-        kernel = copy_kernel(backend)
-        kernel(A, B, ndrange = length(A))
-        return A
-    else
-        return Base.copyto!(A, B)
-    end
-end
-
-functional(::CPU) = true
-pagelock!(::CPU, x) = nothing
-supports_unified(::CPU) = true
-
-function (obj::Kernel{CPU})(args...; ndrange = nothing, workgroupsize = nothing)
-    ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize)
-
-    if length(blocks(iterspace)) == 0
-        return nothing
-    end
-
-    __run(obj, ndrange, iterspace, args, dynamic, obj.backend.static)
-    return nothing
-end
-
-const CPU_GRAINSIZE = 1024 # Vectorization, 4x unrolling, minimal grain size
-function default_cpu_workgroupsize(ndrange)
-    # if the total kernel is small, don't launch multiple tasks
-    n = prod(ndrange)
-    if iszero(n)
-        # If the ndrange is zero return a workgroupsize of (1, 1,...)
-        return map(one, ndrange)
-    elseif n <= CPU_GRAINSIZE
-        return ndrange
-    else
-        available = Ref(CPU_GRAINSIZE)
-        return ntuple(length(ndrange)) do i
-            dim = ndrange[i]
-            remaining = available[]
-            if remaining == 0
-                return 1
-            elseif remaining <= dim
-                available[] = 0
-                return remaining
-            else
-                available[] = remaining ÷ dim
-                return dim
-            end
-        end
-    end
-end
-
-@inline function launch_config(kernel::Kernel{CPU}, ndrange, workgroupsize)
-    if ndrange isa Integer
-        ndrange = (ndrange,)
-    end
-    if workgroupsize isa Integer
-        workgroupsize = (workgroupsize,)
-    end
-
-    if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && workgroupsize === nothing
-        workgroupsize = default_cpu_workgroupsize(ndrange)
-    end
-    iterspace, dynamic = partition(kernel, ndrange, workgroupsize)
-    # partition checked that the ndrange's agreed
-    if KernelAbstractions.ndrange(kernel) <: StaticSize
-        ndrange = nothing
-    end
-
-    return ndrange, workgroupsize, iterspace, dynamic
-end
-
-# Inference barriers
-function __run(obj, ndrange, iterspace, args, dynamic, static_threads)
-    N = length(iterspace)
-    Nthreads = Threads.nthreads()
-    if Nthreads == 1
-        len, rem = N, 0
-    else
-        len, rem = divrem(N, Nthreads)
-    end
-    # not enough iterations for all the threads?
-    if len == 0
-        Nthreads = N
-        len, rem = 1, 0
-    end
-    if Nthreads == 1
-        __thread_run(1, len, rem, obj, ndrange, iterspace, args, dynamic)
-    else
-        if static_threads
-            Threads.@threads :static for tid in 1:Nthreads
-                __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-            end
-        else
-            @sync for tid in 1:Nthreads
-                Threads.@spawn __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-            end
-        end
-    end
-    return nothing
-end
-
-function __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-    # compute this thread's iterations
-    f = 1 + ((tid - 1) * len)
-    l = f + len - 1
-    # distribute remaining iterations evenly
-    if rem > 0
-        if tid <= rem
-            f = f + (tid - 1)
-            l = l + tid
-        else
-            f = f + rem
-            l = l + rem
-        end
-    end
-    # run this thread's iterations
-    for i in f:l
-        block = @inbounds blocks(iterspace)[i]
-        ctx = mkcontext(obj, block, ndrange, iterspace, dynamic)
-        obj.f(ctx, args...)
-    end
-    return nothing
-end
-
-function mkcontext(kernel::Kernel{CPU}, I, _ndrange, iterspace, ::Dynamic) where {Dynamic}
-    return CompilerMetadata{ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
-end
-
-@inline function __index_Local_Linear(ctx, idx::CartesianIndex)
-    indices = workitems(__iterspace(ctx))
-    return @inbounds LinearIndices(indices)[idx]
-end
-
-@inline function __index_Group_Linear(ctx, idx::CartesianIndex)
-    indices = blocks(__iterspace(ctx))
-    return @inbounds LinearIndices(indices)[__groupindex(ctx)]
-end
-
-@inline function __index_Global_Linear(ctx, idx::CartesianIndex)
-    I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx)
-    return @inbounds LinearIndices(__ndrange(ctx))[I]
-end
-
-@inline function __index_Local_Cartesian(_, idx::CartesianIndex)
-    return idx
-end
-
-@inline function __index_Group_Cartesian(ctx, ::CartesianIndex)
-    return __groupindex(ctx)
-end
-
-@inline function __index_Global_Cartesian(ctx, idx::CartesianIndex)
-    return @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx)
-end
-
-@inline function __validindex(ctx, idx::CartesianIndex)
-    # Turns this into a noop for code where we can turn of checkbounds of
-    if __dynamic_checkbounds(ctx)
-        I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx)
-        return I in __ndrange(ctx)
-    else
-        return true
-    end
-end
-
-###
-# CPU implementation of shared memory
-###
-@inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val) where {T, Dims}
-    return MArray{__size(Dims), T}(undef)
-end
-
-###
-# CPU implementation of scratch memory
-# - memory allocated as a MArray with size `Dims`
-###
-
-struct ScratchArray{N, D}
-    data::D
-    ScratchArray{N}(data::D) where {N, D} = new{N, D}(data)
-end
-
-@inline function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
-    return ScratchArray{length(Dims)}(MArray{__size((Dims..., prod(__groupsize(ctx)))), T}(undef))
-end
-
-# Base.view creates a boundscheck which captures A
-# https://github.com/JuliaLang/julia/issues/39308
-@inline function aview(A, I::Vararg{Any, N}) where {N}
-    J = Base.to_indices(A, I)
-    return Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J...)), J...)
-end
-
-@inline function Base.getindex(A::ScratchArray{N}, idx) where {N}
-    return @inbounds aview(A.data, ntuple(_ -> :, Val(N))..., idx)
-end
-
-# Argument conversion
-argconvert(k::Kernel{CPU}, arg) = arg
-
-supports_enzyme(::CPU) = true

From afe5818291ebd087b4b964bfa8a87a5233754d6e Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 22 Jul 2025 10:56:30 -0300
Subject: [PATCH 15/18] Fix cpu backend support

---
 src/pocl/backend.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index ffa24746..f12a4e9b 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -58,6 +58,7 @@ KA.pagelock!(::POCLBackend, x) = nothing
 KA.get_backend(::Array) = POCLBackend()
 KA.synchronize(::POCLBackend) = nothing
 KA.supports_float64(::POCLBackend) = true
+KA.supports_unified(::POCLBackend) = true
 
 
 ## Kernel Launch

From 2ad1d50b0a3c1d81e3b7473d9581da88ceb58e0d Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 22 Jul 2025 10:56:55 -0300
Subject: [PATCH 16/18] Update docstrings

---
 src/KernelAbstractions.jl | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index e9b65d17..64e213a5 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -526,9 +526,9 @@ end
 """
     allocate(::Backend, Type, dims...; unified=false)::AbstractArray
 
-Allocate a storage array appropriate for the computational backend. `unified`
-allocates an array using unified memory if the backend supports it. Use
-[`supports_unified`](@ref) to determine whether it is supported by a backend.
+Allocate a storage array appropriate for the computational backend. `unified=true`
+allocates an array using unified memory if the backend supports it and throws otherwise.
+Use [`supports_unified`](@ref) to determine whether it is supported by a backend.
 
 !!! note
     Backend implementations **must** implement `allocate(::NewBackend, T, dims::Tuple)`
@@ -550,7 +550,8 @@ end
     zeros(::Backend, Type, dims...; unified=false)::AbstractArray
 
 Allocate a storage array appropriate for the computational backend filled with zeros.
-`unified` allocates an array using unified memory if the backend supports it.
+`unified=true` allocates an array using unified memory if the backend supports it and
+throws otherwise.
 """
 zeros(backend::Backend, T::Type, dims...; kwargs...) = zeros(backend, T, dims; kwargs...)
 function zeros(backend::Backend, ::Type{T}, dims::Tuple; kwargs...) where {T}
@@ -563,7 +564,8 @@ end
     ones(::Backend, Type, dims...; unified=false)::AbstractArray
 
 Allocate a storage array appropriate for the computational backend filled with ones.
-`unified` allocates an array using unified memory if the backend supports it.
+`unified=true` allocates an array using unified memory if the backend supports it and
+throws otherwise.
 """
 ones(backend::Backend, T::Type, dims...; kwargs...) = ones(backend, T, dims; kwargs...)
 function ones(backend::Backend, ::Type{T}, dims::Tuple; kwargs...) where {T}

From fa116c452407cda98675b61e15b5b92e84cf0fed Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 22 Jul 2025 11:01:05 -0300
Subject: [PATCH 17/18] Format

---
 src/pocl/backend.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index f12a4e9b..e733cae3 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -21,7 +21,7 @@ end
 
 ## Memory Operations
 
-KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple; unified::Bool=false) where {T} = Array{T}(undef, dims)
+KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where {T} = Array{T}(undef, dims)
 
 function KA.zeros(backend::POCLBackend, ::Type{T}, dims::Tuple; kwargs...) where {T}
     arr = KA.allocate(backend, T, dims; kwargs...)

From 98ba8903b7fc2272b8727b0acd48a81fe3cd6547 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 22 Jul 2025 11:44:06 -0300
Subject: [PATCH 18/18] Add `supports_unified` to docs

---
 docs/src/api.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/src/api.md b/docs/src/api.md
index 9373d231..4e107075 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -21,6 +21,7 @@ allocate
 
 ```@docs
 KernelAbstractions.zeros
+KernelAbstractions.supports_unified
 ```
 
 ## Internal