diff --git a/Project.toml b/Project.toml
index 3ea1895..6ab5499 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,42 +1,22 @@
 name = "DaggerGPU"
 uuid = "68e73e28-2238-4d5a-bf97-e5d4aa3c4be2"
 authors = ["Julian P Samaroo <jpsamaroo@jpsamaroo.me>"]
-version = "0.2.0"
+version = "0.2.1"
 
 [deps]
-AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
-Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
-oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
-[weakdeps]
-AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
-oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
-
-[extensions]
-CUDAExt = "CUDA"
-IntelExt = "oneAPI"
-MetalExt = "Metal"
-ROCExt = "AMDGPU"
-
 [compat]
-AMDGPU = "0.9.4"
 Adapt = "1, 2, 3, 4"
-CUDA = "3, 4, 5"
-Dagger = "0.18.12"
+Dagger = "0.18.17"
 KernelAbstractions = "0.9"
 MemPool = "0.3, 0.4"
-Metal = "1.1"
-oneAPI = "1"
 Requires = "1"
 julia = "1.7"
diff --git a/README.md b/README.md
index 5922938..51e04d6 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,12 @@
 
 **GPU integrations for Dagger.jl**
 
+## Deprecation Notice
+
+DaggerGPU's logic and extensions have been merged upstream into Dagger.jl, making DaggerGPU no longer necessary. You can now load CUDA logic with `using Dagger, CUDA`, and similarly for all other backends. Please report all issues to the [Dagger issue tracker](https://github.com/JuliaParallel/Dagger.jl/issues).
+
+## Original README
+
 DaggerGPU.jl makes use of the `Dagger.Processor` infrastructure to dispatch Dagger kernels to NVIDIA, AMD, and Apple GPUs, via CUDA.jl, AMDGPU.jl, and Metal.jl respectively. Usage is simple: `add` or `dev` DaggerGPU.jl and CUDA.jl/AMDGPU.jl/Metal.jl appropriately, load it with `using DaggerGPU`, and add `DaggerGPU.CuArrayDeviceProc`/`DaggerGPU.ROCArrayProc`/`DaggerGPU.MtlArrayDeviceProc` to your scheduler or thunk options (see Dagger.jl documentation for details on how to do this).
 
 DaggerGPU.jl is still experimental, but we welcome GPU-owning users to try it out and report back on any issues or sharp edges that they encounter. When filing an issue about DaggerGPU.jl, please provide:
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
deleted file mode 100644
index 400b1dd..0000000
--- a/ext/CUDAExt.jl
+++ /dev/null
@@ -1,344 +0,0 @@
-module CUDAExt
-
-export CuArrayDeviceProc
-
-import Dagger, DaggerGPU, MemPool
-import Dagger: CPURAMMemorySpace, Chunk, unwrap
-import MemPool: DRef, poolget
-import Distributed: myid, remotecall_fetch
-import LinearAlgebra
-using KernelAbstractions, Adapt
-
-const CPUProc = Union{Dagger.OSProc,Dagger.ThreadProc}
-
-if isdefined(Base, :get_extension)
-    import CUDA
-else
-    import ..CUDA
-end
-import CUDA: CuDevice, CuContext, CuStream, CuArray, CUDABackend
-import CUDA: devices, attribute, context, context!, stream, stream!
-import CUDA: CUBLAS, CUSOLVER
-
-using UUIDs
-
-"Represents a single CUDA GPU device."
-struct CuArrayDeviceProc <: Dagger.Processor
-    owner::Int
-    device::Int
-    device_uuid::UUID
-end
-Dagger.get_parent(proc::CuArrayDeviceProc) = Dagger.OSProc(proc.owner)
-Dagger.root_worker_id(proc::CuArrayDeviceProc) = proc.owner
-Base.show(io::IO, proc::CuArrayDeviceProc) =
-    print(io, "CuArrayDeviceProc(worker $(proc.owner), device $(proc.device), uuid $(proc.device_uuid))")
-Dagger.short_name(proc::CuArrayDeviceProc) = "W: $(proc.owner), CUDA: $(proc.device)"
-DaggerGPU.@gpuproc(CuArrayDeviceProc, CuArray)
-
-"Represents the memory space of a single CUDA GPU's VRAM."
-struct CUDAVRAMMemorySpace <: Dagger.MemorySpace
-    owner::Int
-    device::Int
-    device_uuid::UUID
-end
-Dagger.root_worker_id(space::CUDAVRAMMemorySpace) = space.owner
-function Dagger.memory_space(x::CuArray)
-    dev = CUDA.device(x)
-    device_id = dev.handle
-    device_uuid = CUDA.uuid(dev)
-    return CUDAVRAMMemorySpace(myid(), device_id, device_uuid)
-end
-
-Dagger.memory_spaces(proc::CuArrayDeviceProc) = Set([CUDAVRAMMemorySpace(proc.owner, proc.device, proc.device_uuid)])
-Dagger.processors(space::CUDAVRAMMemorySpace) = Set([CuArrayDeviceProc(space.owner, space.device, space.device_uuid)])
-
-function to_device(proc::CuArrayDeviceProc)
-    @assert Dagger.root_worker_id(proc) == myid()
-    return DEVICES[proc.device]
-end
-function to_context(proc::CuArrayDeviceProc)
-    @assert Dagger.root_worker_id(proc) == myid()
-    return CONTEXTS[proc.device]
-end
-to_context(handle::Integer) = CONTEXTS[handle]
-to_context(dev::CuDevice) = to_context(dev.handle)
-
-function with_context!(handle::Integer)
-    context!(CONTEXTS[handle])
-    stream!(STREAMS[handle])
-end
-function with_context!(proc::CuArrayDeviceProc)
-    @assert Dagger.root_worker_id(proc) == myid()
-    with_context!(proc.device)
-end
-function with_context!(space::CUDAVRAMMemorySpace)
-    @assert Dagger.root_worker_id(space) == myid()
-    with_context!(space.device)
-end
-function with_context(f, x)
-    old_ctx = context()
-    old_stream = stream()
-
-    with_context!(x)
-    try
-        f()
-    finally
-        context!(old_ctx)
-        stream!(old_stream)
-    end
-end
-
-function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace})
-    if Dagger.root_worker_id(x) == myid()
-        with_context(CUDA.synchronize, x)
-    else
-        # Do nothing, as we have received our value over a serialization
-        # boundary, which should synchronize for us
-    end
-end
-
-# Allocations
-Dagger.allocate_array_func(::CuArrayDeviceProc, ::typeof(rand)) = CUDA.rand
-Dagger.allocate_array_func(::CuArrayDeviceProc, ::typeof(randn)) = CUDA.randn
-Dagger.allocate_array_func(::CuArrayDeviceProc, ::typeof(ones)) = CUDA.ones
-Dagger.allocate_array_func(::CuArrayDeviceProc, ::typeof(zeros)) = CUDA.zeros
-struct AllocateUndef{S} end
-(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = CuArray{S,N}(undef, dims)
-Dagger.allocate_array_func(::CuArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}()
-
-# In-place
-# N.B. These methods assume that later operations will implicitly or
-# explicitly synchronize with their associated stream
-function Dagger.move!(to_space::Dagger.CPURAMMemorySpace, from_space::CUDAVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    if Dagger.root_worker_id(from_space) == myid()
-        sync_with_context(from_space)
-        with_context!(from_space)
-    end
-    copyto!(to, from)
-    # N.B. DtoH will synchronize
-    return
-end
-function Dagger.move!(to_space::CUDAVRAMMemorySpace, from_space::Dagger.CPURAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    with_context!(to_space)
-    copyto!(to, from)
-    return
-end
-function Dagger.move!(to_space::CUDAVRAMMemorySpace, from_space::CUDAVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    sync_with_context(from_space)
-    with_context!(to_space)
-    copyto!(to, from)
-    return
-end
-
-# Out-of-place HtoD
-function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x)
-    with_context(to_proc) do
-        arr = adapt(CuArray, x)
-        CUDA.synchronize()
-        return arr
-    end
-end
-function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::Chunk)
-    from_w = Dagger.root_worker_id(from_proc)
-    to_w = Dagger.root_worker_id(to_proc)
-    @assert myid() == to_w
-    cpu_data = remotecall_fetch(unwrap, from_w, x)
-    with_context(to_proc) do
-        arr = adapt(CuArray, cpu_data)
-        CUDA.synchronize()
-        return arr
-    end
-end
-function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::CuArray)
-    if CUDA.device(x) == to_device(to_proc)
-        return x
-    end
-    with_context(to_proc) do
-        _x = similar(x)
-        copyto!(_x, x)
-        CUDA.synchronize()
-        return _x
-    end
-end
-
-# Out-of-place DtoH
-function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x)
-    with_context(from_proc) do
-        CUDA.synchronize()
-        _x = adapt(Array, x)
-        CUDA.synchronize()
-        return _x
-    end
-end
-function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x::Chunk)
-    from_w = Dagger.root_worker_id(from_proc)
-    to_w = Dagger.root_worker_id(to_proc)
-    @assert myid() == to_w
-    remotecall_fetch(from_w, x) do x
-        arr = unwrap(x)
-        return Dagger.move(from_proc, to_proc, arr)
-    end
-end
-function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x::CuArray{T,N}) where {T,N}
-    with_context(from_proc) do
-        CUDA.synchronize()
-        _x = Array{T,N}(undef, size(x))
-        copyto!(_x, x)
-        CUDA.synchronize()
-        return _x
-    end
-end
-
-# Out-of-place DtoD
-function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CuArrayDeviceProc, x::Dagger.Chunk{T}) where T<:CuArray
-    if from_proc == to_proc
-        # Same process and GPU, no change
-        arr = unwrap(x)
-        with_context(CUDA.synchronize, from_proc)
-        return arr
-    elseif Dagger.root_worker_id(from_proc) == Dagger.root_worker_id(to_proc)
-        # Same process but different GPUs, use DtoD copy
-        from_arr = unwrap(x)
-        with_context(CUDA.synchronize, from_proc)
-        return with_context(to_proc) do
-            to_arr = similar(from_arr)
-            copyto!(to_arr, from_arr)
-            CUDA.synchronize()
-            return to_arr
-        end
-    elseif Dagger.system_uuid(from_proc.owner) == Dagger.system_uuid(to_proc.owner) && from_proc.device_uuid == to_proc.device_uuid
-        # Same node, we can use IPC
-        ipc_handle, eT, shape = remotecall_fetch(from_proc.owner, x) do x
-            arr = unwrap(x)
-            ipc_handle_ref = Ref{CUDA.CUipcMemHandle}()
-            GC.@preserve arr begin
-                CUDA.cuIpcGetMemHandle(ipc_handle_ref, pointer(arr))
-            end
-            (ipc_handle_ref[], eltype(arr), size(arr))
-        end
-        r_ptr = Ref{CUDA.CUdeviceptr}()
-        CUDA.device!(from_proc.device) do
-            CUDA.cuIpcOpenMemHandle(r_ptr, ipc_handle, CUDA.CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS)
-        end
-        ptr = Base.unsafe_convert(CUDA.CuPtr{eT}, r_ptr[])
-        arr = unsafe_wrap(CuArray, ptr, shape; own=false)
-        finalizer(arr) do arr
-            CUDA.cuIpcCloseMemHandle(pointer(arr))
-        end
-        if from_proc.device_uuid != to_proc.device_uuid
-            return CUDA.device!(to_proc.device) do
-                to_arr = similar(arr)
-                copyto!(to_arr, arr)
-                to_arr
-            end
-        else
-            return arr
-        end
-    else
-        # Different node, use DtoH, serialization, HtoD
-        return CuArray(remotecall_fetch(from_proc.owner, x) do x
-            Array(unwrap(x))
-        end)
-    end
-end
-
-# Adapt generic functions
-Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::Function) = x
-Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::Chunk{T}) where {T<:Function} =
-    Dagger.move(from_proc, to_proc, fetch(x))
-
-# Adapt BLAS/LAPACK functions
-import LinearAlgebra: BLAS, LAPACK
-for lib in [BLAS, LAPACK]
-    for name in names(lib; all=true)
-        name == nameof(lib) && continue
-        startswith(string(name), '#') && continue
-        endswith(string(name), '!') || continue
-
-        for culib in [CUBLAS, CUSOLVER]
-            if name in names(culib; all=true)
-                fn = getproperty(lib, name)
-                cufn = getproperty(culib, name)
-                @eval Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, ::$(typeof(fn))) = $cufn
-            end
-        end
-    end
-end
-
-# Task execution
-function Dagger.execute!(proc::CuArrayDeviceProc, f, args...; kwargs...)
-    @nospecialize f args kwargs
-    tls = Dagger.get_tls()
-    task = Threads.@spawn begin
-        Dagger.set_tls!(tls)
-        with_context!(proc)
-        result = Base.@invokelatest f(args...; kwargs...)
-        # N.B. Synchronization must be done when accessing result or args
-        return result
-    end
-
-    try
-        fetch(task)
-    catch err
-        stk = current_exceptions(task)
-        err, frames = stk[1]
-        rethrow(CapturedException(err, frames))
-    end
-end
-
-DaggerGPU.processor(::Val{:CUDA}) = CuArrayDeviceProc
-DaggerGPU.cancompute(::Val{:CUDA}) = CUDA.has_cuda()
-DaggerGPU.kernel_backend(::CuArrayDeviceProc) = CUDABackend()
-DaggerGPU.with_device(f, proc::CuArrayDeviceProc) =
-    CUDA.device!(f, proc.device)
-
-Dagger.to_scope(::Val{:cuda_gpu}, sc::NamedTuple) =
-    Dagger.to_scope(Val{:cuda_gpus}(), merge(sc, (;cuda_gpus=[sc.cuda_gpu])))
-Dagger.scope_key_precedence(::Val{:cuda_gpu}) = 1
-function Dagger.to_scope(::Val{:cuda_gpus}, sc::NamedTuple)
-    if haskey(sc, :worker)
-        workers = Int[sc.worker]
-    elseif haskey(sc, :workers) && sc.workers != Colon()
-        workers = sc.workers
-    else
-        workers = map(gproc->gproc.pid, Dagger.procs(Dagger.Sch.eager_context()))
-    end
-    scopes = Dagger.ExactScope[]
-    dev_ids = sc.cuda_gpus
-    for worker in workers
-        procs = Dagger.get_processors(Dagger.OSProc(worker))
-        for proc in procs
-            proc isa CuArrayDeviceProc || continue
-            if dev_ids == Colon() || proc.device+1 in dev_ids
-                scope = Dagger.ExactScope(proc)
-                push!(scopes, scope)
-            end
-        end
-    end
-    return Dagger.UnionScope(scopes)
-end
-Dagger.scope_key_precedence(::Val{:cuda_gpus}) = 1
-
-const DEVICES = Dict{Int, CuDevice}()
-const CONTEXTS = Dict{Int, CuContext}()
-const STREAMS = Dict{Int, CuStream}()
-
-function __init__()
-    if CUDA.has_cuda()
-        for dev in CUDA.devices()
-            @debug "Registering CUDA GPU processor with Dagger: $dev"
-            Dagger.add_processor_callback!("cuarray_device_$(dev.handle)") do
-                proc = CuArrayDeviceProc(myid(), dev.handle, CUDA.uuid(dev))
-                DEVICES[dev.handle] = dev
-                ctx = context(dev)
-                CONTEXTS[dev.handle] = ctx
-                context!(ctx) do
-                    STREAMS[dev.handle] = stream()
-                end
-                return proc
-            end
-        end
-    end
-end
-
-end # module CUDAExt
diff --git a/ext/IntelExt.jl b/ext/IntelExt.jl
deleted file mode 100644
index 74f3aa3..0000000
--- a/ext/IntelExt.jl
+++ /dev/null
@@ -1,312 +0,0 @@
-module IntelExt
-
-export oneArrayDeviceProc
-
-import Dagger, DaggerGPU, MemPool
-import Dagger: CPURAMMemorySpace, Chunk, unwrap
-import MemPool: DRef, poolget
-import Distributed: myid, remotecall_fetch
-import LinearAlgebra
-using KernelAbstractions, Adapt
-
-const CPUProc = Union{Dagger.OSProc,Dagger.ThreadProc}
-
-if isdefined(Base, :get_extension)
-    import oneAPI
-else
-    import ..oneAPI
-end
-import oneAPI: ZeDevice, ZeDriver, ZeContext, oneArray, oneAPIBackend
-import oneAPI: driver, driver!, device, device!, context, context!
-#import oneAPI: CUBLAS, CUSOLVER
-
-using UUIDs
-
-"Represents a single Intel GPU device."
-struct oneArrayDeviceProc <: Dagger.Processor
-    owner::Int
-    device_id::Int
-end
-Dagger.get_parent(proc::oneArrayDeviceProc) = Dagger.OSProc(proc.owner)
-Dagger.root_worker_id(proc::oneArrayDeviceProc) = proc.owner
-Base.show(io::IO, proc::oneArrayDeviceProc) =
-    print(io, "oneArrayDeviceProc(worker $(proc.owner), device $(proc.device_id))")
-Dagger.short_name(proc::oneArrayDeviceProc) = "W: $(proc.owner), oneAPI: $(proc.device)"
-DaggerGPU.@gpuproc(oneArrayDeviceProc, oneArray)
-
-"Represents the memory space of a single Intel GPU's VRAM."
-struct IntelVRAMMemorySpace <: Dagger.MemorySpace
-    owner::Int
-    device_id::Int
-end
-Dagger.root_worker_id(space::IntelVRAMMemorySpace) = space.owner
-function Dagger.memory_space(x::oneArray)
-    dev = oneAPI.device(x)
-    device_id = _device_id(dev)
-    return IntelVRAMMemorySpace(myid(), device_id)
-end
-_device_id(dev::ZeDevice) = findfirst(other_dev->other_dev === dev, collect(oneAPI.devices()))
-
-Dagger.memory_spaces(proc::oneArrayDeviceProc) = Set([IntelVRAMMemorySpace(proc.owner, proc.device_id)])
-Dagger.processors(space::IntelVRAMMemorySpace) = Set([oneArrayDeviceProc(space.owner, space.device_id)])
-
-function to_device(proc::oneArrayDeviceProc)
-    @assert Dagger.root_worker_id(proc) == myid()
-    return DEVICES[proc.device_id]
-end
-
-function with_context!(device_id::Integer)
-    driver!(DRIVERS[device_id])
-    device!(DEVICES[device_id])
-    context!(CONTEXTS[device_id])
-end
-function with_context!(proc::oneArrayDeviceProc)
-    @assert Dagger.root_worker_id(proc) == myid()
-    with_context!(proc.device_id)
-end
-function with_context!(space::IntelVRAMMemorySpace)
-    @assert Dagger.root_worker_id(space) == myid()
-    with_context!(space.device_id)
-end
-function with_context(f, x)
-    old_drv = driver()
-    old_dev = device()
-    old_ctx = context()
-
-    with_context!(x)
-    try
-        f()
-    finally
-        driver!(old_drv)
-        device!(old_dev)
-        context!(old_ctx)
-    end
-end
-
-function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace})
-    if Dagger.root_worker_id(x) == myid()
-        with_context(oneAPI.synchronize, x)
-    else
-        # Do nothing, as we have received our value over a serialization
-        # boundary, which should synchronize for us
-    end
-end
-
-# Allocations
-Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(rand)) = oneAPI.rand
-Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(randn)) = oneAPI.randn
-Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(ones)) = oneAPI.ones
-Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(zeros)) = oneAPI.zeros
-struct AllocateUndef{S} end
-(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = oneArray{S,N}(undef, dims)
-Dagger.allocate_array_func(::oneArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}()
-
-# In-place
-# N.B. These methods assume that later operations will implicitly or
-# explicitly synchronize with their associated stream
-function Dagger.move!(to_space::Dagger.CPURAMMemorySpace, from_space::IntelVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    if Dagger.root_worker_id(from_space) == myid()
-        sync_with_context(from_space)
-        with_context!(from_space)
-    end
-    copyto!(to, from)
-    # N.B. DtoH will synchronize
-    return
-end
-function Dagger.move!(to_space::IntelVRAMMemorySpace, from_space::Dagger.CPURAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    with_context!(to_space)
-    copyto!(to, from)
-    return
-end
-function Dagger.move!(to_space::IntelVRAMMemorySpace, from_space::IntelVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    sync_with_context(from_space)
-    with_context!(to_space)
-    copyto!(to, from)
-    return
-end
-
-# Out-of-place HtoD
-function Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x)
-    with_context(to_proc) do
-        arr = adapt(oneArray, x)
-        oneAPI.synchronize()
-        return arr
-    end
-end
-function Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::Chunk)
-    from_w = Dagger.root_worker_id(from_proc)
-    to_w = Dagger.root_worker_id(to_proc)
-    @assert myid() == to_w
-    cpu_data = remotecall_fetch(unwrap, from_w, x)
-    with_context(to_proc) do
-        arr = adapt(oneArray, cpu_data)
-        oneAPI.synchronize()
-        return arr
-    end
-end
-function Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::oneArray)
-    if oneAPI.device(x) == to_device(to_proc)
-        return x
-    end
-    with_context(to_proc) do
-        _x = similar(x)
-        copyto!(_x, x)
-        oneAPI.synchronize()
-        return _x
-    end
-end
-
-# Out-of-place DtoH
-function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::CPUProc, x)
-    with_context(from_proc) do
-        oneAPI.synchronize()
-        _x = adapt(Array, x)
-        oneAPI.synchronize()
-        return _x
-    end
-end
-function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::CPUProc, x::Chunk)
-    from_w = Dagger.root_worker_id(from_proc)
-    to_w = Dagger.root_worker_id(to_proc)
-    @assert myid() == to_w
-    remotecall_fetch(from_w, x) do x
-        arr = unwrap(x)
-        return Dagger.move(from_proc, to_proc, arr)
-    end
-end
-function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::CPUProc, x::oneArray{T,N}) where {T,N}
-    with_context(from_proc) do
-        oneAPI.synchronize()
-        _x = Array{T,N}(undef, size(x))
-        copyto!(_x, x)
-        oneAPI.synchronize()
-        return _x
-    end
-end
-
-# Out-of-place DtoD
-function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::oneArrayDeviceProc, x::Dagger.Chunk{T}) where T<:oneArray
-    if from_proc == to_proc
-        # Same process and GPU, no change
-        arr = unwrap(x)
-        with_context(oneAPI.synchronize, from_proc)
-        return arr
-    elseif Dagger.root_worker_id(from_proc) == Dagger.root_worker_id(to_proc)
-        # Same process but different GPUs, use DtoD copy
-        from_arr = unwrap(x)
-        with_context(oneAPI.synchronize, from_proc)
-        return with_context(to_proc) do
-            to_arr = similar(from_arr)
-            copyto!(to_arr, from_arr)
-            oneAPI.synchronize()
-            return to_arr
-        end
-    else
-        # Different node, use DtoH, serialization, HtoD
-        return oneArray(remotecall_fetch(from_proc.owner, x) do x
-            Array(unwrap(x))
-        end)
-    end
-end
-
-# Adapt generic functions
-Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::Function) = x
-Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::Chunk{T}) where {T<:Function} =
-    Dagger.move(from_proc, to_proc, fetch(x))
-
-#= FIXME: Adapt BLAS/LAPACK functions
-import LinearAlgebra: BLAS, LAPACK
-for lib in [BLAS, LAPACK]
-    for name in names(lib; all=true)
-        name == nameof(lib) && continue
-        startswith(string(name), '#') && continue
-        endswith(string(name), '!') || continue
-
-        for culib in [CUBLAS, CUSOLVER]
-            if name in names(culib; all=true)
-                fn = getproperty(lib, name)
-                cufn = getproperty(culib, name)
-                @eval Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, ::$(typeof(fn))) = $cufn
-            end
-        end
-    end
-end
-=#
-
-# Task execution
-function Dagger.execute!(proc::oneArrayDeviceProc, f, args...; kwargs...)
-    @nospecialize f args kwargs
-    tls = Dagger.get_tls()
-    task = Threads.@spawn begin
-        Dagger.set_tls!(tls)
-        with_context!(proc)
-        result = Base.@invokelatest f(args...; kwargs...)
-        # N.B. Synchronization must be done when accessing result or args
-        return result
-    end
-
-    try
-        fetch(task)
-    catch err
-        stk = current_exceptions(task)
-        err, frames = stk[1]
-        rethrow(CapturedException(err, frames))
-    end
-end
-
-DaggerGPU.processor(::Val{:oneAPI}) = oneArrayDeviceProc
-DaggerGPU.cancompute(::Val{:oneAPI}) = oneAPI.functional()
-DaggerGPU.kernel_backend(::oneArrayDeviceProc) = oneAPIBackend()
-DaggerGPU.with_device(f, proc::oneArrayDeviceProc) =
-    device!(f, proc.device_id)
-
-Dagger.to_scope(::Val{:intel_gpu}, sc::NamedTuple) =
-    Dagger.to_scope(Val{:intel_gpus}(), merge(sc, (;intel_gpus=[sc.intel_gpu])))
-Dagger.scope_key_precedence(::Val{:intel_gpu}) = 1
-function Dagger.to_scope(::Val{:intel_gpus}, sc::NamedTuple)
-    if haskey(sc, :worker)
-        workers = Int[sc.worker]
-    elseif haskey(sc, :workers) && sc.workers != Colon()
-        workers = sc.workers
-    else
-        workers = map(gproc->gproc.pid, Dagger.procs(Dagger.Sch.eager_context()))
-    end
-    scopes = Dagger.ExactScope[]
-    dev_ids = sc.intel_gpus
-    for worker in workers
-        procs = Dagger.get_processors(Dagger.OSProc(worker))
-        for proc in procs
-            proc isa oneArrayDeviceProc || continue
-            if dev_ids == Colon() || proc.device_id in dev_ids
-                scope = Dagger.ExactScope(proc)
-                push!(scopes, scope)
-            end
-        end
-    end
-    return Dagger.UnionScope(scopes)
-end
-Dagger.scope_key_precedence(::Val{:intel_gpus}) = 1
-
-const DEVICES = Dict{Int, ZeDevice}()
-const DRIVERS = Dict{Int, ZeDriver}()
-const CONTEXTS = Dict{Int, ZeContext}()
-
-function __init__()
-    if oneAPI.functional()
-        for (device_id, dev) in enumerate(oneAPI.devices())
-            @debug "Registering Intel GPU processor with Dagger: $dev"
-            Dagger.add_processor_callback!("zearray_device_$(device_id)") do
-                proc = oneArrayDeviceProc(myid(), device_id)
-                DEVICES[device_id] = dev
-                driver!(dev.driver)
-                DRIVERS[device_id] = dev.driver
-                device!(dev)
-                ctx = ZeContext(dev.driver)
-                CONTEXTS[device_id] = ctx
-                return proc
-            end
-        end
-    end
-end
-
-end # module IntelExt
diff --git a/ext/MetalExt.jl b/ext/MetalExt.jl
deleted file mode 100644
index e4580c6..0000000
--- a/ext/MetalExt.jl
+++ /dev/null
@@ -1,377 +0,0 @@
-module MetalExt
-
-export MtlArrayDeviceProc
-
-import Dagger, DaggerGPU, MemPool
-import Dagger: CPURAMMemorySpace, Chunk, unwrap
-import MemPool: DRef, poolget
-import Distributed: myid, remotecall_fetch
-import LinearAlgebra
-using KernelAbstractions, Adapt
-
-const CPUProc = Union{Dagger.OSProc,Dagger.ThreadProc}
-
-if isdefined(Base, :get_extension)
-    import Metal
-else
-    import ..Metal
-end
-import Metal: MtlArray, MetalBackend
-# FIXME: import Metal: MTLBLAS, MTLSOLVER
-const MtlDevice = Metal.MTL.MTLDeviceInstance
-const MtlStream = Metal.MTL.MTLCommandQueue
-
-struct MtlArrayDeviceProc <: Dagger.Processor
-    owner::Int
-    device_id::UInt64
-end
-
-Dagger.get_parent(proc::MtlArrayDeviceProc) = Dagger.OSProc(proc.owner)
-Dagger.root_worker_id(proc::MtlArrayDeviceProc) = proc.owner
-Dagger.short_name(proc::MtlArrayDeviceProc) = "W: $(proc.owner), Metal: $(proc.device_id)"
-DaggerGPU.@gpuproc(MtlArrayDeviceProc, MtlArray)
-
-"Represents the memory space of a single Metal GPU's VRAM."
-struct MetalVRAMMemorySpace <: Dagger.MemorySpace
-    owner::Int
-    device_id::Int
-end
-Dagger.root_worker_id(space::MetalVRAMMemorySpace) = space.owner
-function Dagger.memory_space(x::MtlArray)
-    dev = Metal.device(x)
-    device_id = _device_id(dev)
-    return MetalVRAMMemorySpace(myid(), device_id)
-end
-_device_id(dev::MtlDevice) = findfirst(other_dev->other_dev === dev, Metal.devices())
-
-Dagger.memory_spaces(proc::MtlArrayDeviceProc) = Set([MetalVRAMMemorySpace(proc.owner, proc.device_id)])
-Dagger.processors(space::MetalVRAMMemorySpace) = Set([MtlArrayDeviceProc(space.owner, space.device_id)])
-
-function to_device(proc::MtlArrayDeviceProc)
-    @assert Dagger.root_worker_id(proc) == myid()
-    return DEVICES[proc.device_id]
-end
-function to_context(proc::MtlArrayDeviceProc)
-    @assert Dagger.root_worker_id(proc) == myid()
-    return Metal.global_queue() #CONTEXTS[proc.device]
-end
-to_context(device_id::Integer) = Metal.global_queue(DEVICES[device_id])
-to_context(dev::MtlDevice) = to_context(_device_id(dev))
-
-function with_context!(device_id::Integer)
-end
-function with_context!(proc::MtlArrayDeviceProc)
-    @assert Dagger.root_worker_id(proc) == myid()
-end
-function with_context!(space::MetalVRAMMemorySpace)
-    @assert Dagger.root_worker_id(space) == myid()
-end
-function with_context(f, x)
-    with_context!(x)
-    return f()
-end
-
-function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace})
-    if Dagger.root_worker_id(x) == myid()
-        with_context(Metal.synchronize, x)
-    else
-        # Do nothing, as we have received our value over a serialization
-        # boundary, which should synchronize for us
-    end
-end
-
-# Allocations
-Dagger.allocate_array_func(::MtlArrayDeviceProc, ::typeof(rand)) = Metal.rand
-Dagger.allocate_array_func(::MtlArrayDeviceProc, ::typeof(randn)) = Metal.randn
-Dagger.allocate_array_func(::MtlArrayDeviceProc, ::typeof(ones)) = Metal.ones
-Dagger.allocate_array_func(::MtlArrayDeviceProc, ::typeof(zeros)) = Metal.zeros
-struct AllocateUndef{S} end
-(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = MtlArray{S,N}(undef, dims)
-Dagger.allocate_array_func(::MtlArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}()
-
-# In-place
-# N.B. These methods assume that later operations will implicitly or
-# explicitly synchronize with their associated stream
-function Dagger.move!(to_space::Dagger.CPURAMMemorySpace, from_space::MetalVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    if Dagger.root_worker_id(from_space) == myid()
-        sync_with_context(from_space)
-        with_context!(from_space)
-    end
-    copyto!(to, from)
-    # N.B. DtoH will synchronize
-    return
-end
-function Dagger.move!(to_space::MetalVRAMMemorySpace, from_space::Dagger.CPURAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    with_context!(to_space)
-    copyto!(to, from)
-    return
-end
-function Dagger.move!(to_space::MetalVRAMMemorySpace, from_space::MetalVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    sync_with_context(from_space)
-    with_context!(to_space)
-    copyto!(to, from)
-    return
-end
-
-# Out-of-place HtoD
-function Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, x)
-    with_context(to_proc) do
-        arr = adapt(MtlArray, x)
-        Metal.synchronize()
-        return arr
-    end
-end
-function Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, x::Chunk)
-    from_w = Dagger.root_worker_id(from_proc)
-    to_w = Dagger.root_worker_id(to_proc)
-    @assert myid() == to_w
-    cpu_data = remotecall_fetch(unwrap, from_w, x)
-    with_context(to_proc) do
-        arr = adapt(MtlArray, cpu_data)
-        Metal.synchronize()
-        return arr
-    end
-end
-function Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, x::MtlArray)
-    if Metal.device(x) == to_device(to_proc)
-        return x
-    end
-    with_context(to_proc) do
-        _x = similar(x)
-        copyto!(_x, x)
-        Metal.synchronize()
-        return _x
-    end
-end
-
-# Out-of-place DtoH
-function Dagger.move(from_proc::MtlArrayDeviceProc, to_proc::CPUProc, x)
-    with_context(from_proc) do
-        Metal.synchronize()
-        _x = adapt(Array, x)
-        Metal.synchronize()
-        return _x
-    end
-end
-function Dagger.move(from_proc::MtlArrayDeviceProc, to_proc::CPUProc, x::Chunk)
-    from_w = Dagger.root_worker_id(from_proc)
-    to_w = Dagger.root_worker_id(to_proc)
-    @assert myid() == to_w
-    remotecall_fetch(from_w, x) do x
-        arr = unwrap(x)
-        return Dagger.move(from_proc, to_proc, arr)
-    end
-end
-function Dagger.move(from_proc::MtlArrayDeviceProc, to_proc::CPUProc, x::MtlArray{T,N}) where {T,N}
-    with_context(from_proc) do
-        Metal.synchronize()
-        _x = Array{T,N}(undef, size(x))
-        copyto!(_x, x)
-        Metal.synchronize()
-        return _x
-    end
-end
-
-# Out-of-place DtoD
-function Dagger.move(from_proc::MtlArrayDeviceProc, to_proc::MtlArrayDeviceProc, x::Dagger.Chunk{T}) where T<:MtlArray
-    if from_proc == to_proc
-        # Same process and GPU, no change
-        arr = unwrap(x)
-        with_context(Metal.synchronize, from_proc)
-        return arr
-    # FIXME: elseif Dagger.root_worker_id(from_proc) == Dagger.root_worker_id(to_proc)
-    else
-        # Different node, use DtoH, serialization, HtoD
-        return MtlArray(remotecall_fetch(from_proc.owner, x) do x
-            Array(unwrap(x))
-        end)
-    end
-end
-
-# Adapt generic functions
-Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, x::Function) = x
-Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, x::Chunk{T}) where {T<:Function} =
-    Dagger.move(from_proc, to_proc, fetch(x))
-
-#= FIXME: Adapt BLAS/LAPACK functions
-import LinearAlgebra: BLAS, LAPACK
-for lib in [BLAS, LAPACK]
-    for name in names(lib; all=true)
-        name == nameof(lib) && continue
-        startswith(string(name), '#') && continue
-        endswith(string(name), '!') || continue
-
-        for culib in [MTLBLAS, MTLSOLVER]
-            if name in names(culib; all=true)
-                fn = getproperty(lib, name)
-                cufn = getproperty(culib, name)
-                @eval Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, ::$(typeof(fn))) = $cufn
-            end
-        end
-    end
-end
-=#
-
-function DaggerGPU.move_optimized(
-    from_proc::CPUProc,
-    to_proc::MtlArrayDeviceProc,
-    x::Array
-)
-    # FIXME
-    return nothing
-
-    # If we have unified memory, we can try casting the `Array` to `MtlArray`.
-    device = _get_metal_device(to_proc)
-
-    if (device !== nothing) && device.hasUnifiedMemory
-        marray = _cast_array_to_mtlarray(x, device)
-        marray !== nothing && return marray
-    end
-
-    return nothing
-end
-function DaggerGPU.move_optimized(
-    from_proc::MtlArrayDeviceProc,
-    to_proc::CPUProc,
-    x::Array
-)
-    # FIXME
-    return nothing
-
-    # If we have unified memory, we can just cast the `MtlArray` to an `Array`.
-    device = _get_metal_device(from_proc)
-
-    if (device !== nothing) && device.hasUnifiedMemory
-        return unsafe_wrap(Array{T}, x, size(x))
-    end
-
-    return nothing
-end
-
-# Task execution
-function Dagger.execute!(proc::MtlArrayDeviceProc, f, args...; kwargs...)
-    @nospecialize f args kwargs
-    tls = Dagger.get_tls()
-    task = Threads.@spawn begin
-        Dagger.set_tls!(tls)
-        with_context!(proc)
-        result = Base.@invokelatest f(args...; kwargs...)
-        # N.B. Synchronization must be done when accessing result or args
-        return result
-    end
-
-    try
-        fetch(task)
-    catch err
-        stk = current_exceptions(task)
-        err, frames = stk[1]
-        rethrow(CapturedException(err, frames))
-    end
-end
-
-function Base.show(io::IO, proc::MtlArrayDeviceProc)
-    print(io, "MtlArrayDeviceProc(worker $(proc.owner), device $(something(_get_metal_device(proc)).name))")
-end
-
-DaggerGPU.processor(::Val{:Metal}) = MtlArrayDeviceProc
-DaggerGPU.cancompute(::Val{:Metal}) = Metal.functional()
-DaggerGPU.kernel_backend(proc::MtlArrayDeviceProc) = MetalBackend()
-# TODO: Switch devices
-DaggerGPU.with_device(f, proc::MtlArrayDeviceProc) = f()
-
-Dagger.to_scope(::Val{:metal_gpu}, sc::NamedTuple) =
-    Dagger.to_scope(Val{:metal_gpus}(), merge(sc, (;metal_gpus=[sc.metal_gpu])))
-Dagger.scope_key_precedence(::Val{:metal_gpu}) = 1
-function Dagger.to_scope(::Val{:metal_gpus}, sc::NamedTuple)
-    if haskey(sc, :worker)
-        workers = Int[sc.worker]
-    elseif haskey(sc, :workers) && sc.workers != Colon()
-        workers = sc.workers
-    else
-        workers = map(gproc->gproc.pid, Dagger.procs(Dagger.Sch.eager_context()))
-    end
-    scopes = Dagger.ExactScope[]
-    dev_ids = sc.metal_gpus
-    for worker in workers
-        procs = Dagger.get_processors(Dagger.OSProc(worker))
-        for proc in procs
-            proc isa MtlArrayDeviceProc || continue
-            if dev_ids == Colon() || proc.device_id in dev_ids
-                scope = Dagger.ExactScope(proc)
-                push!(scopes, scope)
-            end
-        end
-    end
-    return Dagger.UnionScope(scopes)
-end
-Dagger.scope_key_precedence(::Val{:metal_gpus}) = 1
-
-const DEVICES = Dict{Int, MtlDevice}()
-#const CONTEXTS = Dict{Int, MtlContext}()
-#const STREAMS = Dict{Int, MtlStream}()
-
-function __init__()
-    if Metal.functional()
-        for (dev_id, dev) in enumerate(Metal.devices())
-            @debug "Registering Metal GPU processor with Dagger: $dev"
-            # FIXME: We only get a Ptr now
-            Dagger.add_processor_callback!("metal_device_$(dev_id)") do
-                proc = MtlArrayDeviceProc(myid(), dev_id)
-                DEVICES[dev_id] = dev
-                #=
-                ctx = context(dev)
-                CONTEXTS[dev.device_id] = ctx
-                context!(ctx) do
-                    STREAMS[dev.device_id] = stream()
-                end
-                =#
-                return proc
-            end
-        end
-    end
-end
-
-
-################################################################################
-#                              Private functions
-################################################################################
-
-# Try casting the array `x` to an `MtlArray`. If the casting is not possible,
-# return `nothing`.
-function _cast_array_to_mtlarray(x::Array{T,N}, device::MtlDevice) where {T,N}
-    # Try creating the buffer without copying.
-    dims = size(x)
-    nbytes_array = prod(dims) * sizeof(T)
-    pagesize = ccall(:getpagesize, Cint, ())
-    num_pages = div(nbytes_array, pagesize, RoundUp)
-    nbytes = num_pages * pagesize
-
-    pbuf = Metal.MTL.mtDeviceNewBufferWithBytesNoCopy(
-        device,
-        pointer(x),
-        nbytes,
-        Metal.Shared | Metal.MTL.DefaultTracking | Metal.MTL.DefaultCPUCache
-    )
-
-    if pbuf != C_NULL
-        buf = MtlBuffer(pbuf)
-        marray = MtlArray{T,N}(buf, dims)
-        return marray
-    end
-
-    # If we reached here, the conversion was not possible.
-    return nothing
-end
-
-# Return the Metal device handler given the ID recorded in `proc`.
-function _get_metal_device(proc::MtlArrayDeviceProc)
-    devices = Metal.devices()
-
-    if devices === nothing
-        return nothing
-    else
-        return devices[proc.device_id]
-    end
-end
-
-end # module MetalExt
diff --git a/ext/ROCExt.jl b/ext/ROCExt.jl
deleted file mode 100644
index 1f66589..0000000
--- a/ext/ROCExt.jl
+++ /dev/null
@@ -1,315 +0,0 @@
-module ROCExt
-
-export ROCArrayDeviceProc
-
-import Dagger, DaggerGPU, MemPool
-import Dagger: CPURAMMemorySpace, Chunk, unwrap
-import MemPool: DRef, poolget
-import Distributed: myid, remotecall_fetch
-import LinearAlgebra
-using KernelAbstractions, Adapt
-
-const CPUProc = Union{Dagger.OSProc,Dagger.ThreadProc}
-
-if isdefined(Base, :get_extension)
-    import AMDGPU
-else
-    import ..AMDGPU
-end
-import AMDGPU: HIPDevice, HIPContext, HIPStream, ROCArray, ROCBackend
-import AMDGPU: devices, context, context!, stream, stream!
-import AMDGPU: rocBLAS, rocSOLVER
-
-struct ROCArrayDeviceProc <: Dagger.Processor
-    owner::Int
-    device_id::Int
-end
-Dagger.get_parent(proc::ROCArrayDeviceProc) = Dagger.OSProc(proc.owner)
-Dagger.root_worker_id(proc::ROCArrayDeviceProc) = proc.owner
-Base.show(io::IO, proc::ROCArrayDeviceProc) =
-    print(io, "ROCArrayDeviceProc(worker $(proc.owner), device $(proc.device_id))")
-Dagger.short_name(proc::ROCArrayDeviceProc) = "W: $(proc.owner), ROCm: $(proc.device_id)"
-DaggerGPU.@gpuproc(ROCArrayDeviceProc, ROCArray)
-
-"Represents the memory space of a single ROCm GPU's VRAM."
-struct ROCVRAMMemorySpace <: Dagger.MemorySpace
-    owner::Int
-    device_id::Int
-end
-Dagger.root_worker_id(space::ROCVRAMMemorySpace) = space.owner
-Dagger.memory_space(x::ROCArray) =
-    ROCVRAMMemorySpace(myid(), AMDGPU.device(x).device_id)
-
-Dagger.memory_spaces(proc::ROCArrayDeviceProc) = Set([ROCVRAMMemorySpace(proc.owner, proc.device_id)])
-Dagger.processors(space::ROCVRAMMemorySpace) = Set([ROCArrayDeviceProc(space.owner, space.device_id)])
-
-function to_device(proc::ROCArrayDeviceProc)
-    @assert Dagger.root_worker_id(proc) == myid()
-    return DEVICES[proc.device_id]
-end
-function to_context(proc::ROCArrayDeviceProc)
-    @assert Dagger.root_worker_id(proc) == myid()
-    return CONTEXTS[proc.device_id]
-end
-to_context(handle::Integer) = CONTEXTS[handle]
-to_context(dev::HIPDevice) = to_context(dev.device_id)
-
-function with_context!(handle::Integer)
-    context!(CONTEXTS[handle])
-    AMDGPU.device!(DEVICES[handle])
-    stream!(STREAMS[handle])
-end
-function with_context!(proc::ROCArrayDeviceProc)
-    @assert Dagger.root_worker_id(proc) == myid()
-    with_context!(proc.device_id)
-end
-function with_context!(space::ROCVRAMMemorySpace)
-    @assert Dagger.root_worker_id(space) == myid()
-    with_context!(space.device_id)
-end
-function with_context(f, x)
-    old_ctx = context()
-    old_device = AMDGPU.device()
-    old_stream = stream()
-
-    with_context!(x)
-    try
-        f()
-    finally
-        context!(old_ctx)
-        AMDGPU.device!(old_device)
-        stream!(old_stream)
-    end
-end
-
-function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace})
-    if Dagger.root_worker_id(x) == myid()
-        with_context(AMDGPU.synchronize, x)
-    else
-        # Do nothing, as we have received our value over a serialization
-        # boundary, which should synchronize for us
-    end
-end
-
-# Allocations
-# FIXME: Avoids some segfaults in rocRAND
-fake_rand(::Type{T}, dims::NTuple{N}) where {T,N} = ROCArray(rand(T, dims))
-fake_randn(::Type{T}, dims::NTuple{N}) where {T,N} = ROCArray(randn(T, dims))
-Dagger.allocate_array_func(::ROCArrayDeviceProc, ::typeof(rand)) = fake_rand
-Dagger.allocate_array_func(::ROCArrayDeviceProc, ::typeof(randn)) = fake_randn
-Dagger.allocate_array_func(::ROCArrayDeviceProc, ::typeof(ones)) = AMDGPU.ones
-Dagger.allocate_array_func(::ROCArrayDeviceProc, ::typeof(zeros)) = AMDGPU.zeros
-struct AllocateUndef{S} end
-(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = ROCArray{S,N}(undef, dims)
-Dagger.allocate_array_func(::ROCArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}()
-
-# In-place
-# N.B. These methods assume that later operations will implicitly or
-# explicitly synchronize with their associated stream
-function Dagger.move!(to_space::Dagger.CPURAMMemorySpace, from_space::ROCVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    if Dagger.root_worker_id(from_space) == myid()
-        sync_with_context(from_space)
-        with_context!(from_space)
-    end
-    copyto!(to, from)
-    # N.B. DtoH will synchronize
-    return
-end
-function Dagger.move!(to_space::ROCVRAMMemorySpace, from_space::Dagger.CPURAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    with_context!(to_space)
-    copyto!(to, from)
-    return
-end
-function Dagger.move!(to_space::ROCVRAMMemorySpace, from_space::ROCVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
-    sync_with_context(from_space)
-    with_context!(to_space)
-    copyto!(to, from)
-    return
-end
-
-# Out-of-place HtoD
-function Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, x)
-    with_context(to_proc) do
-        arr = adapt(ROCArray, x)
-        AMDGPU.synchronize()
-        return arr
-    end
-end
-function Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, x::Chunk)
-    from_w = Dagger.root_worker_id(from_proc)
-    to_w = Dagger.root_worker_id(to_proc)
-    @assert myid() == to_w
-    cpu_data = remotecall_fetch(unwrap, from_w, x)
-    with_context(to_proc) do
-        arr = adapt(ROCArray, cpu_data)
-        AMDGPU.synchronize()
-        return arr
-    end
-end
-function Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, x::ROCArray)
-    if AMDGPU.device(x) == to_device(to_proc)
-        return x
-    end
-    with_context(to_proc) do
-        _x = similar(x)
-        copyto!(_x, x)
-        AMDGPU.synchronize()
-        return _x
-    end
-end
-
-# Out-of-place DtoH
-function Dagger.move(from_proc::ROCArrayDeviceProc, to_proc::CPUProc, x)
-    with_context(from_proc) do
-        AMDGPU.synchronize()
-        _x = adapt(Array, x)
-        AMDGPU.synchronize()
-        return _x
-    end
-end
-function Dagger.move(from_proc::ROCArrayDeviceProc, to_proc::CPUProc, x::Chunk)
-    from_w = Dagger.root_worker_id(from_proc)
-    to_w = Dagger.root_worker_id(to_proc)
-    @assert myid() == to_w
-    remotecall_fetch(from_w, x) do x
-        arr = unwrap(x)
-        return Dagger.move(from_proc, to_proc, arr)
-    end
-end
-function Dagger.move(from_proc::ROCArrayDeviceProc, to_proc::CPUProc, x::ROCArray{T,N}) where {T,N}
-    with_context(AMDGPU.device(x).device_id) do
-        AMDGPU.synchronize()
-        _x = Array{T,N}(undef, size(x))
-        copyto!(_x, x)
-        AMDGPU.synchronize()
-        return _x
-    end
-end
-
-# Out-of-place DtoD
-function Dagger.move(from_proc::ROCArrayDeviceProc, to_proc::ROCArrayDeviceProc, x::Dagger.Chunk{T}) where T<:ROCArray
-    if from_proc == to_proc
-        # Same process and GPU, no change
-        arr = unwrap(x)
-        with_context(AMDGPU.synchronize, from_proc)
-        return arr
-    elseif Dagger.root_worker_id(from_proc) == Dagger.root_worker_id(to_proc)
-        # Same process but different GPUs, use DtoD copy
-        from_arr = unwrap(x)
-        dev = AMDGPU.device(from_arr)
-        with_context(AMDGPU.synchronize, dev.device_id)
-        return with_context(to_proc) do
-            to_arr = similar(from_arr)
-            copyto!(to_arr, from_arr)
-            AMDGPU.synchronize()
-            to_arr
-        end
-    else
-        # Different node, use DtoH, serialization, HtoD
-        return ROCArray(remotecall_fetch(from_proc.owner, x) do x
-            Array(unwrap(x))
-        end)
-    end
-end
-
-# Adapt generic functions
-Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, x::Function) = x
-Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, x::Chunk{T}) where {T<:Function} =
-    Dagger.move(from_proc, to_proc, fetch(x))
-
-# Adapt BLAS/LAPACK functions
-import LinearAlgebra: BLAS, LAPACK
-for lib in [BLAS, LAPACK]
-    for name in names(lib; all=true)
-        name == nameof(lib) && continue
-        startswith(string(name), '#') && continue
-        endswith(string(name), '!') || continue
-
-        for roclib in [rocBLAS, rocSOLVER]
-            if name in names(roclib; all=true)
-                fn = getproperty(lib, name)
-                rocfn = getproperty(roclib, name)
-                @eval Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, ::$(typeof(fn))) = $rocfn
-            end
-        end
-    end
-end
-
-# Task execution
-function Dagger.execute!(proc::ROCArrayDeviceProc, f, args...; kwargs...)
-    @nospecialize f args kwargs
-    tls = Dagger.get_tls()
-    task = Threads.@spawn begin
-        Dagger.set_tls!(tls)
-        with_context!(proc)
-        result = Base.@invokelatest f(args...; kwargs...)
-        # N.B. Synchronization must be done when accessing result or args
-        return result
-    end
-
-    try
-        fetch(task)
-    catch err
-        stk = current_exceptions(task)
-        err, frames = stk[1]
-        rethrow(CapturedException(err, frames))
-    end
-end
-
-DaggerGPU.processor(::Val{:ROC}) = ROCArrayDeviceProc
-DaggerGPU.cancompute(::Val{:ROC}) = AMDGPU.functional()
-DaggerGPU.kernel_backend(proc::ROCArrayDeviceProc) = ROCBackend()
-DaggerGPU.with_device(f, proc::ROCArrayDeviceProc) =
-    AMDGPU.device!(f, AMDGPU.devices()[proc.device_id])
-
-Dagger.to_scope(::Val{:rocm_gpu}, sc::NamedTuple) =
-    Dagger.to_scope(Val{:rocm_gpus}(), merge(sc, (;rocm_gpus=[sc.rocm_gpu])))
-function Dagger.to_scope(::Val{:rocm_gpus}, sc::NamedTuple)
-    if haskey(sc, :worker)
-        workers = Int[sc.worker]
-    elseif haskey(sc, :workers) && sc.workers != Colon()
-        workers = sc.workers
-    else
-        workers = map(gproc->gproc.pid, Dagger.procs(Dagger.Sch.eager_context()))
-    end
-    scopes = Dagger.ExactScope[]
-    dev_ids = sc.rocm_gpus
-    for worker in workers
-        procs = Dagger.get_processors(Dagger.OSProc(worker))
-        for proc in procs
-            proc isa ROCArrayDeviceProc || continue
-            if dev_ids == Colon() || proc.device_id in dev_ids
-                scope = Dagger.ExactScope(proc)
-                push!(scopes, scope)
-            end
-        end
-    end
-    return Dagger.UnionScope(scopes)
-end
-Dagger.scope_key_precedence(::Val{:rocm_gpu}) = 2
-Dagger.scope_key_precedence(::Val{:rocm_gpus}) = 1
-
-const DEVICES = Dict{Int, HIPDevice}()
-const CONTEXTS = Dict{Int, HIPContext}()
-const STREAMS = Dict{Int, HIPStream}()
-
-function __init__()
-    if AMDGPU.functional()
-        for device_id in 1:length(AMDGPU.devices())
-            dev = AMDGPU.devices()[device_id]
-            @debug "Registering ROCm GPU processor with Dagger: $dev"
-            Dagger.add_processor_callback!("rocarray_device_$device_id") do
-                proc = ROCArrayDeviceProc(myid(), device_id)
-                DEVICES[dev.device_id] = dev
-                ctx = HIPContext(dev)
-                CONTEXTS[dev.device_id] = ctx
-                context!(ctx) do
-                    STREAMS[dev.device_id] = HIPStream()
-                end
-                return proc
-            end
-        end
-    end
-end
-
-end # module ROCExt
diff --git a/src/DaggerGPU.jl b/src/DaggerGPU.jl
index 1d8cbce..999523a 100644
--- a/src/DaggerGPU.jl
+++ b/src/DaggerGPU.jl
@@ -1,64 +1,9 @@
 module DaggerGPU
 
-using Dagger, MemPool
-using Distributed
-using KernelAbstractions, Adapt
+import Dagger: Kernel, gpu_processor, gpu_can_compute, with_device, move_optimized, gpu_kernel_backend
 
-import Dagger: Chunk
-import LinearAlgebra
-
-const CPUProc = Union{OSProc, Dagger.ThreadProc}
-
-struct Kernel{F} end
-Kernel(f) = Kernel{f}()
-
-function (::Kernel{F})(args...; ndrange) where F
-    @nospecialize args
-    dev = kernel_backend()
-    kern = F(dev)
-    kern(args...; ndrange)
-    KernelAbstractions.synchronize(dev)
-end
-
-macro gpuproc(PROC, T)
-    PROC = esc(PROC)
-    T = esc(T)
-    quote
-        # Assume that we can run anything
-        Dagger.iscompatible_func(proc::$PROC, opts, f) = true
-        Dagger.iscompatible_arg(proc::$PROC, opts, x) = true
-
-        # CPUs shouldn't process our array type
-        Dagger.iscompatible_arg(proc::Dagger.ThreadProc, opts, x::$T) = false
-    end
-end
-
-processor(kind::Symbol) = processor(Val(kind))
-processor(::Val) = Dagger.ThreadProc
-cancompute(kind::Symbol) = cancompute(Val(kind))
-cancompute(::Val) = false
-function with_device end
-
-move_optimized(from_proc::Dagger.Processor,
-               to_proc::Dagger.Processor,
-               x) = nothing
-
-kernel_backend() = kernel_backend(Dagger.Sch.thunk_processor())
-kernel_backend(::Dagger.ThreadProc) = CPU()
-
-using Requires
-@static if !isdefined(Base, :get_extension)
-function __init__()
-    @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
-        include(joinpath(@__DIR__, "..", "ext", "CUDAExt.jl"))
-    end
-    @require AMDGPU="21141c5a-9bdb-4563-92ae-f87d6854732e" begin
-        include(joinpath(@__DIR__, "..", "ext", "ROCExt.jl"))
-    end
-    @require Metal="dde4c033-4e86-420c-a63e-0dd931031962" begin
-        include(joinpath(@__DIR__, "..", "ext", "MetalExt.jl"))
-    end
-end
-end
+@deprecate processor gpu_processor
+@deprecate cancompute gpu_can_compute
+@deprecate kernel_backend gpu_kernel_backend
 
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 89982c4..14f598a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -105,12 +105,14 @@ end
     if !DaggerGPU.cancompute(:CUDA)
         @warn "No CUDA devices available, skipping tests"
     else
+        #=
         cuproc = if isdefined(Base, :get_extension)
             Base.get_extension(DaggerGPU, :CUDAExt).CuArrayDeviceProc
         else
             CuArrayDeviceProc
         end
         @test DaggerGPU.processor(:CUDA) === cuproc
+        =#
         ndevices = length(collect(CUDA.devices()))
         gpu_configs = Any[1]
         if ndevices > 1
@@ -222,12 +224,14 @@ end
     if !DaggerGPU.cancompute(:ROC)
         @warn "No ROCm devices available, skipping tests"
     else
+        #=
         rocproc = if isdefined(Base, :get_extension)
             Base.get_extension(DaggerGPU, :ROCExt).ROCArrayDeviceProc
         else
             ROCArrayDeviceProc
         end
         @test DaggerGPU.processor(:ROC) === rocproc
+        =#
         ndevices = length(AMDGPU.devices())
         gpu_configs = Any[1]
         if ndevices > 1
@@ -339,12 +343,14 @@ end
     if !DaggerGPU.cancompute(:oneAPI)
         @warn "No oneAPI devices available, skipping tests"
     else
+        #=
         oneproc = if isdefined(Base, :get_extension)
             Base.get_extension(DaggerGPU, :IntelExt).oneArrayDeviceProc
         else
             oneArrayDeviceProc
         end
         @test DaggerGPU.processor(:oneAPI) === oneproc
+        =#
         ndevices = length(oneAPI.devices())
         gpu_configs = Any[1]
         if ndevices > 1
@@ -460,12 +466,14 @@ end
     if !DaggerGPU.cancompute(:Metal)
         @warn "No Metal devices available, skipping tests"
     else
+        #=
         mtlproc = if isdefined(Base, :get_extension)
             Base.get_extension(DaggerGPU, :MetalExt).MtlArrayDeviceProc
         else
             MtlArrayDeviceProc
         end
         @test DaggerGPU.processor(:Metal) === mtlproc
+        =#
         b = generate_thunks()
         c = Dagger.with_options(;scope=Dagger.scope(metal_gpu=1)) do
             @test fetch(Dagger.@spawn isongpu(b))