diff --git a/docs/src/api.md b/docs/src/api.md
index 9373d231a..4e107075b 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -21,6 +21,7 @@ allocate
 
 ```@docs
 KernelAbstractions.zeros
+KernelAbstractions.supports_unified
 ```
 
 ## Internal
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 15757e3a2..64e213a5b 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -524,47 +524,74 @@ end
 # adapt_storage(::Backend, a::BackendArray) = a
 
 """
-    allocate(::Backend, Type, dims...)::AbstractArray
+    allocate(::Backend, Type, dims...; unified=false)::AbstractArray
 
-Allocate a storage array appropriate for the computational backend.
+Allocate a storage array appropriate for the computational backend. `unified=true`
+allocates an array using unified memory if the backend supports it and throws an error
+otherwise. Use [`supports_unified`](@ref) to determine whether it is supported by a backend.
 
 !!! note
     Backend implementations **must** implement `allocate(::NewBackend, T, dims::Tuple)`
-"""
-allocate(backend::Backend, T::Type, dims...) = allocate(backend, T, dims)
-allocate(backend::Backend, T::Type, dims::Tuple) = throw(MethodError(allocate, (backend, T, dims)))
+    Backend implementations **should** implement `allocate(::NewBackend, T, dims::Tuple; unified::Bool=false)`
+"""
+allocate(backend::Backend, T::Type, dims...; kwargs...) = allocate(backend, T, dims; kwargs...)
+function allocate(backend::Backend, T::Type, dims::Tuple; unified::Union{Nothing, Bool} = nothing)
+    if isnothing(unified)
+        throw(MethodError(allocate, (backend, T, dims)))
+    elseif unified
+        throw(ArgumentError("`$(typeof(backend))` does not support unified memory. If you believe it does, please open a GitHub issue."))
+    else
+        return allocate(backend, T, dims)
+    end
+end
+
 """
-    zeros(::Backend, Type, dims...)::AbstractArray
+    zeros(::Backend, Type, dims...; unified=false)::AbstractArray
 
 Allocate a storage array appropriate for the computational backend filled with zeros.
+`unified=true` allocates an array using unified memory if the backend supports it and
+throws an error otherwise.
 """
-zeros(backend::Backend, T::Type, dims...) = zeros(backend, T, dims)
-function zeros(backend::Backend, ::Type{T}, dims::Tuple) where {T}
-    data = allocate(backend, T, dims...)
+zeros(backend::Backend, T::Type, dims...; kwargs...) = zeros(backend, T, dims; kwargs...)
+function zeros(backend::Backend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    data = allocate(backend, T, dims...; kwargs...)
     fill!(data, zero(T))
     return data
 end
 
 """
-    ones(::Backend, Type, dims...)::AbstractArray
+    ones(::Backend, Type, dims...; unified=false)::AbstractArray
 
 Allocate a storage array appropriate for the computational backend filled with ones.
+`unified=true` allocates an array using unified memory if the backend supports it and
+throws an error otherwise.
 """
-ones(backend::Backend, T::Type, dims...) = ones(backend, T, dims)
-function ones(backend::Backend, ::Type{T}, dims::Tuple) where {T}
-    data = allocate(backend, T, dims)
+ones(backend::Backend, T::Type, dims...; kwargs...) = ones(backend, T, dims; kwargs...)
+function ones(backend::Backend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    data = allocate(backend, T, dims; kwargs...)
    fill!(data, one(T))
     return data
 end
 
+"""
+    supports_unified(::Backend)::Bool
+
+Returns whether unified memory arrays are supported by the backend.
+
+!!! note
+    Backend implementations **must** implement this function
+    only if they **do** support unified memory.
+""" +supports_unified(::Backend) = false + """ supports_atomics(::Backend)::Bool Returns whether `@atomic` operations are supported by the backend. !!! note - Backend implementations **must** implement this function, + Backend implementations **must** implement this function only if they **do not** support atomic operations with Atomix. """ supports_atomics(::Backend) = true @@ -575,7 +602,7 @@ supports_atomics(::Backend) = true Returns whether `Float64` values are supported by the backend. !!! note - Backend implementations **must** implement this function, + Backend implementations **must** implement this function only if they **do not** support `Float64`. """ supports_float64(::Backend) = true diff --git a/src/cpu.jl b/src/cpu.jl deleted file mode 100644 index e383386f7..000000000 --- a/src/cpu.jl +++ /dev/null @@ -1,224 +0,0 @@ -synchronize(::CPU) = nothing - -allocate(::CPU, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims) - -function zeros(backend::CPU, ::Type{T}, dims::Tuple) where {T} - arr = allocate(backend, T, dims) - kernel = init_kernel(backend) - kernel(arr, zero, T, ndrange = length(arr)) - return arr -end -function ones(backend::CPU, ::Type{T}, dims::Tuple) where {T} - arr = allocate(backend, T, dims) - kernel = init_kernel(backend) - kernel(arr, one, T; ndrange = length(arr)) - return arr -end - -function copyto!(backend::CPU, A, B) - if get_backend(A) == get_backend(B) && get_backend(A) isa CPU - if length(A) != length(B) - error("Arrays must match in length") - end - if Base.mightalias(A, B) - error("Arrays may not alias") - end - kernel = copy_kernel(backend) - kernel(A, B, ndrange = length(A)) - return A - else - return Base.copyto!(A, B) - end -end - -functional(::CPU) = true -pagelock!(::CPU, x) = nothing - -function (obj::Kernel{CPU})(args...; ndrange = nothing, workgroupsize = nothing) - ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize) - - if length(blocks(iterspace)) == 0 - return nothing - end - - __run(obj, ndrange, iterspace, args, dynamic, obj.backend.static) - return nothing -end - -const CPU_GRAINSIZE = 1024 # Vectorization, 4x unrolling, minimal grain size -function default_cpu_workgroupsize(ndrange) - # if the total kernel is small, don't launch multiple tasks - n = prod(ndrange) - if iszero(n) - # If the ndrange is zero return a workgroupsize of (1, 1,...) 
-        return map(one, ndrange)
-    elseif n <= CPU_GRAINSIZE
-        return ndrange
-    else
-        available = Ref(CPU_GRAINSIZE)
-        return ntuple(length(ndrange)) do i
-            dim = ndrange[i]
-            remaining = available[]
-            if remaining == 0
-                return 1
-            elseif remaining <= dim
-                available[] = 0
-                return remaining
-            else
-                available[] = remaining ÷ dim
-                return dim
-            end
-        end
-    end
-end
-
-@inline function launch_config(kernel::Kernel{CPU}, ndrange, workgroupsize)
-    if ndrange isa Integer
-        ndrange = (ndrange,)
-    end
-    if workgroupsize isa Integer
-        workgroupsize = (workgroupsize,)
-    end
-
-    if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && workgroupsize === nothing
-        workgroupsize = default_cpu_workgroupsize(ndrange)
-    end
-    iterspace, dynamic = partition(kernel, ndrange, workgroupsize)
-    # partition checked that the ndrange's agreed
-    if KernelAbstractions.ndrange(kernel) <: StaticSize
-        ndrange = nothing
-    end
-
-    return ndrange, workgroupsize, iterspace, dynamic
-end
-
-# Inference barriers
-function __run(obj, ndrange, iterspace, args, dynamic, static_threads)
-    N = length(iterspace)
-    Nthreads = Threads.nthreads()
-    if Nthreads == 1
-        len, rem = N, 0
-    else
-        len, rem = divrem(N, Nthreads)
-    end
-    # not enough iterations for all the threads?
-    if len == 0
-        Nthreads = N
-        len, rem = 1, 0
-    end
-    if Nthreads == 1
-        __thread_run(1, len, rem, obj, ndrange, iterspace, args, dynamic)
-    else
-        if static_threads
-            Threads.@threads :static for tid in 1:Nthreads
-                __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-            end
-        else
-            @sync for tid in 1:Nthreads
-                Threads.@spawn __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-            end
-        end
-    end
-    return nothing
-end
-
-function __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
-    # compute this thread's iterations
-    f = 1 + ((tid - 1) * len)
-    l = f + len - 1
-    # distribute remaining iterations evenly
-    if rem > 0
-        if tid <= rem
-            f = f + (tid - 1)
-            l = l + tid
-        else
-            f = f + rem
-            l = l + rem
-        end
-    end
-    # run this thread's iterations
-    for i in f:l
-        block = @inbounds blocks(iterspace)[i]
-        ctx = mkcontext(obj, block, ndrange, iterspace, dynamic)
-        obj.f(ctx, args...)
-    end
-    return nothing
-end
-
-function mkcontext(kernel::Kernel{CPU}, I, _ndrange, iterspace, ::Dynamic) where {Dynamic}
-    return CompilerMetadata{ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
-end
-
-@inline function __index_Local_Linear(ctx, idx::CartesianIndex)
-    indices = workitems(__iterspace(ctx))
-    return @inbounds LinearIndices(indices)[idx]
-end
-
-@inline function __index_Group_Linear(ctx, idx::CartesianIndex)
-    indices = blocks(__iterspace(ctx))
-    return @inbounds LinearIndices(indices)[__groupindex(ctx)]
-end
-
-@inline function __index_Global_Linear(ctx, idx::CartesianIndex)
-    I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx)
-    return @inbounds LinearIndices(__ndrange(ctx))[I]
-end
-
-@inline function __index_Local_Cartesian(_, idx::CartesianIndex)
-    return idx
-end
-
-@inline function __index_Group_Cartesian(ctx, ::CartesianIndex)
-    return __groupindex(ctx)
-end
-
-@inline function __index_Global_Cartesian(ctx, idx::CartesianIndex)
-    return @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx)
-end
-
-@inline function __validindex(ctx, idx::CartesianIndex)
-    # Turns this into a noop for code where we can turn of checkbounds of
-    if __dynamic_checkbounds(ctx)
-        I = @inbounds expand(__iterspace(ctx), __groupindex(ctx), idx)
-        return I in __ndrange(ctx)
-    else
-        return true
-    end
-end
-
-###
-# CPU implementation of shared memory
-###
-@inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val) where {T, Dims}
-    return MArray{__size(Dims), T}(undef)
-end
-
-###
-# CPU implementation of scratch memory
-# - memory allocated as a MArray with size `Dims`
-###
-
-struct ScratchArray{N, D}
-    data::D
-    ScratchArray{N}(data::D) where {N, D} = new{N, D}(data)
-end
-
-@inline function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
-    return ScratchArray{length(Dims)}(MArray{__size((Dims..., prod(__groupsize(ctx)))), T}(undef))
-end
-
-# Base.view creates a boundscheck which captures A
-# https://github.com/JuliaLang/julia/issues/39308
-@inline function aview(A, I::Vararg{Any, N}) where {N}
-    J = Base.to_indices(A, I)
-    return Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J...)), J...)
-end
-
-@inline function Base.getindex(A::ScratchArray{N}, idx) where {N}
-    return @inbounds aview(A.data, ntuple(_ -> :, Val(N))..., idx)
-end
-
-# Argument conversion
-argconvert(k::Kernel{CPU}, arg) = arg
-
-supports_enzyme(::CPU) = true
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 8e7fcc083..e733cae37 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -21,16 +21,16 @@ end
 
 ## Memory Operations
 
-KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims)
+KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where {T} = Array{T}(undef, dims)
 
-function KA.zeros(backend::POCLBackend, ::Type{T}, dims::Tuple) where {T}
-    arr = KA.allocate(backend, T, dims)
+function KA.zeros(backend::POCLBackend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    arr = KA.allocate(backend, T, dims; kwargs...)
     kernel = KA.init_kernel(backend)
     kernel(arr, zero, T, ndrange = length(arr))
     return arr
 end
-function KA.ones(backend::POCLBackend, ::Type{T}, dims::Tuple) where {T}
-    arr = KA.allocate(backend, T, dims)
+function KA.ones(backend::POCLBackend, ::Type{T}, dims::Tuple; kwargs...) where {T}
+    arr = KA.allocate(backend, T, dims; kwargs...)
     kernel = KA.init_kernel(backend)
     kernel(arr, one, T; ndrange = length(arr))
     return arr
@@ -58,6 +58,7 @@ KA.pagelock!(::POCLBackend, x) = nothing
 KA.get_backend(::Array) = POCLBackend()
 KA.synchronize(::POCLBackend) = nothing
 KA.supports_float64(::POCLBackend) = true
+KA.supports_unified(::POCLBackend) = true
 
 ## Kernel Launch
diff --git a/test/test.jl b/test/test.jl
index 53126e881..4f7f9c7af 100644
--- a/test/test.jl
+++ b/test/test.jl
@@ -77,6 +77,15 @@ function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; sk
     backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false}
     @test backend isa backendT
 
+    unified = KernelAbstractions.supports_unified(backend)
+    @test unified isa Bool
+    U = allocate(backend, Float32, 5; unified)
+    if unified
+        @test U[3] isa Float32
+    else
+        @test_throws ErrorException U[3]
+    end
+
     x = allocate(backend, Float32, 5)
     A = allocate(backend, Float32, 5, 5)
     @test @inferred(KernelAbstractions.get_backend(A)) isa backendT
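
A short usage sketch of the API this patch adds, for reviewers; it is illustrative and not part of the diff. The host array is just a convenient way to obtain a backend object here, and most backends still report `supports_unified(backend) == false` unless they opt in the way `POCLBackend` does above.

```julia
using KernelAbstractions
const KA = KernelAbstractions

# Host arrays map to the package's CPU backend; substitute a device backend
# (e.g. CUDABackend()) where one is available.
backend = KA.get_backend(rand(Float32, 1))

# Feature-detect unified memory instead of catching the ArgumentError.
if KA.supports_unified(backend)
    A = KA.allocate(backend, Float32, 64, 64; unified = true)
else
    A = KA.allocate(backend, Float32, 64, 64)  # ordinary device allocation
end
fill!(A, 1.0f0)
# Host-side scalar access is only guaranteed for unified allocations,
# which is exactly what the test added above exercises.
KA.supports_unified(backend) && @assert A[1, 1] == 1.0f0
```

Note the `unified::Union{Nothing, Bool} = nothing` default on the generic `allocate` fallback: it distinguishes "keyword not passed" (which preserves the old `MethodError` for backends that implement nothing) from an explicit `unified = true` request against a backend that has not opted in (which raises the more descriptive `ArgumentError`).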
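And a minimal sketch of what the docstring's "should implement" note asks of backend authors, mirroring the `POCLBackend` methods in this patch. `MyToyBackend` is hypothetical; a real device backend would return a genuinely unified buffer type here rather than a host `Array`.

```julia
import KernelAbstractions as KA

# Hypothetical host-only backend: host Arrays are trivially host-accessible,
# so both code paths can return a plain Array (as the POCLBackend methods do).
struct MyToyBackend <: KA.Backend end

KA.supports_unified(::MyToyBackend) = true
function KA.allocate(::MyToyBackend, ::Type{T}, dims::Tuple; unified::Bool = false) where {T}
    return Array{T}(undef, dims)
end

# Callers can now pass the keyword without hitting the generic ArgumentError:
B = KA.allocate(MyToyBackend(), Float32, (4, 4); unified = true)
```

Defining both `supports_unified` and the keyword-accepting `allocate` keeps the pair consistent: advertising support while leaving the generic `allocate` fallback in place would turn every `unified = true` request into an `ArgumentError`.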