
Commit 1e0cc54

Merge pull request #18 from JuliaGPU/jps/rocm-revamp
Use package extensions, ROCm/AMDGPU revamp
2 parents 0dc5aa4 + 5353722

File tree

7 files changed: +324 -183 lines changed

Project.toml

Lines changed: 20 additions & 4 deletions
@@ -1,21 +1,37 @@
 name = "DaggerGPU"
 uuid = "68e73e28-2238-4d5a-bf97-e5d4aa3c4be2"
 authors = ["Julian P Samaroo <[email protected]>"]
-version = "0.1.5"
+version = "0.1.6"

 [deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

+[weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
+
+[extensions]
+CUDAExt = "CUDA"
+MetalExt = "Metal"
+ROCExt = "AMDGPU"
+
 [compat]
+AMDGPU = "0.4"
 Adapt = "1, 2, 3"
-Dagger = "0.13.3, 0.14, 0.15, 0.16"
-KernelAbstractions = "0.5, 0.6, 0.7, 0.8"
+CUDA = "3, 4"
+Dagger = "0.17"
+KernelAbstractions = "0.9"
 MemPool = "0.3, 0.4"
+Metal = "0.3, 0.4"
 Requires = "1"
-julia = "1.6"
+julia = "1.7"
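With the new [weakdeps]/[extensions] sections, the GPU backends load lazily: on Julia 1.9+, loading a weak dependency such as CUDA alongside DaggerGPU activates the matching extension module, while on the older Julia versions still permitted by julia = "1.7" the Requires-based fallback (the isdefined(Base, :get_extension) branches in the extension files below) loads the same code. The KernelAbstractions bump to 0.9 also matters here, since that release replaced device types like CUDADevice with backend types like CUDABackend. A minimal usage sketch, assuming CUDA.jl is installed on a CUDA-capable host:

    # Opting into the CUDA backend via the extension mechanism.
    using Dagger, DaggerGPU
    using CUDA   # on Julia 1.9+, this triggers loading of the CUDAExt extension

    # cancompute(::Val{:CUDA}) is defined by CUDAExt (see the diff below);
    # it reports whether a usable CUDA GPU was detected.
    DaggerGPU.cancompute(Val(:CUDA))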
Lines changed: 52 additions & 26 deletions
@@ -1,17 +1,28 @@
-using .CUDA
-import .CUDA: CuDevice, CuContext, devices, attribute
-
-using UUIDs
+module CUDAExt

 export CuArrayDeviceProc

+import Dagger, DaggerGPU, MemPool
+import Distributed: myid, remotecall_fetch
+
+const CPUProc = Union{Dagger.OSProc,Dagger.ThreadProc}
+
+if isdefined(Base, :get_extension)
+    import CUDA
+else
+    import ..CUDA
+end
+import CUDA: CuDevice, CuContext, CuArray, CUDABackend, devices, attribute
+
+using UUIDs
+
 "Represents a single CUDA GPU device."
 struct CuArrayDeviceProc <: Dagger.Processor
     owner::Int
     device::Int
     device_uuid::UUID
 end
-@gpuproc(CuArrayDeviceProc, CuArray)
+DaggerGPU.@gpuproc(CuArrayDeviceProc, CuArray)
 Dagger.get_parent(proc::CuArrayDeviceProc) = Dagger.OSProc(proc.owner)

 # function can_access(this, peer)
@@ -23,10 +34,10 @@ Dagger.get_parent(proc::CuArrayDeviceProc) = Dagger.OSProc(proc.owner)
 function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x::Dagger.Chunk{T}) where T<:CuArray
     if from == to
         # Same process and GPU, no change
-        poolget(x.handle)
+        MemPool.poolget(x.handle)
     elseif from.owner == to.owner
         # Same process but different GPUs, use DtoD copy
-        from_arr = poolget(x.handle)
+        from_arr = MemPool.poolget(x.handle)
         to_arr = CUDA.device!(to.device) do
             CuArray{T,N}(undef, size)
         end
@@ -35,7 +46,7 @@ function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x::Dagger.C
     elseif Dagger.system_uuid(from.owner) == Dagger.system_uuid(to.owner)
         # Same node, we can use IPC
         ipc_handle, eT, shape = remotecall_fetch(from.owner, x.handle) do h
-            arr = poolget(h)
+            arr = MemPool.poolget(h)
             ipc_handle_ref = Ref{CUDA.CUipcMemHandle}()
             GC.@preserve arr begin
                 CUDA.cuIpcGetMemHandle(ipc_handle_ref, pointer(arr))
@@ -64,41 +75,56 @@ function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x::Dagger.C
         # Different node, use DtoH, serialization, HtoD
         # TODO UCX
         CuArray(remotecall_fetch(from.owner, x.handle) do h
-            Array(poolget(h))
+            Array(MemPool.poolget(h))
         end)
     end
 end

-function Dagger.execute!(proc::CuArrayDeviceProc, func, args...)
+function Dagger.execute!(proc::CuArrayDeviceProc, f, args...; kwargs...)
+    @nospecialize f args kwargs
     tls = Dagger.get_tls()
     task = Threads.@spawn begin
         Dagger.set_tls!(tls)
         CUDA.device!(proc.device)
-        CUDA.@sync func(args...)
+        result = Base.@invokelatest f(args...; kwargs...)
+        CUDA.synchronize()
+        return result
     end
+
     try
         fetch(task)
     catch err
-        @static if VERSION >= v"1.1"
-            stk = Base.catch_stack(task)
-            err, frames = stk[1]
-            rethrow(CapturedException(err, frames))
-        else
-            rethrow(task.result)
-        end
+        stk = current_exceptions(task)
+        err, frames = stk[1]
+        rethrow(CapturedException(err, frames))
     end
 end
 Base.show(io::IO, proc::CuArrayDeviceProc) =
-    print(io, "CuArrayDeviceProc on worker $(proc.owner), device $(proc.device), uuid $(proc.device_uuid)")
+    print(io, "CuArrayDeviceProc(worker $(proc.owner), device $(proc.device), uuid $(proc.device_uuid))")

-processor(::Val{:CUDA}) = CuArrayDeviceProc
-cancompute(::Val{:CUDA}) = CUDA.has_cuda()
-kernel_backend(::CuArrayDeviceProc) = CUDADevice()
+DaggerGPU.processor(::Val{:CUDA}) = CuArrayDeviceProc
+DaggerGPU.cancompute(::Val{:CUDA}) = CUDA.has_cuda()
+DaggerGPU.kernel_backend(::CuArrayDeviceProc) = CUDABackend()
+DaggerGPU.with_device(f, proc::CuArrayDeviceProc) =
+    CUDA.device!(f, proc.device)

-if CUDA.has_cuda()
-    for dev in devices()
-        Dagger.add_processor_callback!("cuarray_device_$(dev.handle)") do
-            CuArrayDeviceProc(Distributed.myid(), dev.handle, CUDA.uuid(dev))
+function Dagger.to_scope(::Val{:cuda_gpu}, sc::NamedTuple)
+    worker = get(sc, :worker, 1)
+    dev_id = sc.cuda_gpu
+    dev = collect(CUDA.devices())[dev_id]
+    return Dagger.ExactScope(CuArrayDeviceProc(worker, dev_id-1, CUDA.uuid(dev)))
+end
+Dagger.scope_key_precedence(::Val{:cuda_gpu}) = 1
+
+function __init__()
+    if CUDA.has_cuda()
+        for dev in CUDA.devices()
+            @debug "Registering CUDA GPU processor with Dagger: $dev"
+            Dagger.add_processor_callback!("cuarray_device_$(dev.handle)") do
+                CuArrayDeviceProc(myid(), dev.handle, CUDA.uuid(dev))
            end
         end
     end
 end
+
+end # module CUDAExt
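The new Dagger.to_scope method makes individual GPUs addressable from task scopes. A hedged sketch of caller-side usage, assuming Dagger 0.17's Dagger.scope constructor routes NamedTuple keys through to_scope and that a CUDA GPU is present; the workload here is illustrative:

    using Dagger, DaggerGPU, CUDA

    # Pin a task to the first CUDA device on worker 1 via the new :cuda_gpu key.
    sc = Dagger.scope(worker=1, cuda_gpu=1)
    t = Dagger.@spawn scope=sc sum(CUDA.rand(Float32, 1024))
    fetch(t)   # executes on CuArrayDeviceProc(worker 1, device 0, ...)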
Lines changed: 64 additions & 61 deletions
@@ -1,42 +1,36 @@
-using .Metal
-import .Metal: MtlArray, MtlDevice
+module MetalExt

-struct MtlArrayDeviceProc <: Dagger.Processor
-    owner::Int
-    device_id::UInt64
-end
+export MtlArrayDeviceProc

-# Assume that we can run anything.
-Dagger.iscompatible_func(proc::MtlArrayDeviceProc, opts, f) = true
-Dagger.iscompatible_arg(proc::MtlArrayDeviceProc, opts, x) = true
+import Dagger, DaggerGPU
+import Distributed: myid

-# CPUs shouldn't process our array type.
-Dagger.iscompatible_arg(proc::Dagger.ThreadProc, opts, x::MtlArray) = false
+const CPUProc = Union{Dagger.OSProc,Dagger.ThreadProc}

-function Dagger.move(from_proc::OSProc, to_proc::MtlArrayDeviceProc, x::Chunk)
-    from_pid = from_proc.pid
-    to_pid = Dagger.get_parent(to_proc).pid
-    @assert myid() == to_pid
-
-    return Dagger.move(from_proc, to_proc, remotecall_fetch(x->poolget(x.handle), from_pid, x))
+if isdefined(Base, :get_extension)
+    import Metal
+else
+    import ..Metal
 end
+import Metal: MtlArray, MetalBackend
+const MtlDevice = Metal.MTL.MTLDeviceInstance

-function Dagger.move(from_proc::MtlArrayDeviceProc, to_proc::OSProc, x::Chunk)
-    from_pid = Dagger.get_parent(from_proc).pid
-    to_pid = to_proc.pid
-    @assert myid() == to_pid
-
-    return remotecall_fetch(from_pid, x) do x
-        mtlarray = poolget(x.handle)
-        return Dagger.move(from_proc, to_proc, mtlarray)
-    end
+struct MtlArrayDeviceProc <: Dagger.Processor
+    owner::Int
+    device_id::UInt64
 end

-function Dagger.move(
-    from_proc::OSProc,
+DaggerGPU.@gpuproc(MtlArrayDeviceProc, MtlArray)
+Dagger.get_parent(proc::MtlArrayDeviceProc) = Dagger.OSProc(proc.owner)
+
+function DaggerGPU.move_optimized(
+    from_proc::CPUProc,
     to_proc::MtlArrayDeviceProc,
-    x::Array{T, N}
-) where {T, N}
+    x::Array
+)
+    # FIXME
+    return nothing
+
     # If we have unified memory, we can try casting the `Array` to `MtlArray`.
     device = _get_metal_device(to_proc)

@@ -45,68 +39,75 @@ function Dagger.move(
         marray !== nothing && return marray
     end

-    return adapt(MtlArray, x)
+    return nothing
 end

-function Dagger.move(from_proc::OSProc, to_proc::MtlArrayDeviceProc, x)
-    adapt(MtlArray, x)
-end

-function Dagger.move(
+function DaggerGPU.move_optimized(
     from_proc::MtlArrayDeviceProc,
-    to_proc::OSProc,
-    x::Array{T, N}
-) where {T, N}
+    to_proc::CPUProc,
+    x::Array
+)
+    # FIXME
+    return nothing
+
     # If we have unified memory, we can just cast the `MtlArray` to an `Array`.
     device = _get_metal_device(from_proc)

     if (device !== nothing) && device.hasUnifiedMemory
         return unsafe_wrap(Array{T}, x, size(x))
-    else
-        return adapt(Array, x)
     end
-end

-function Dagger.move(from_proc::MtlArrayDeviceProc, to_proc::OSProc, x)
-    adapt(Array, x)
+    return nothing
 end

-Dagger.get_parent(proc::MtlArrayDeviceProc) = Dagger.OSProc(proc.owner)
-
-function Dagger.execute!(proc::MtlArrayDeviceProc, func, args...)
+function Dagger.execute!(proc::MtlArrayDeviceProc, f, args...; kwargs...)
+    @nospecialize f args kwargs
     tls = Dagger.get_tls()
     task = Threads.@spawn begin
         Dagger.set_tls!(tls)
-        Metal.@sync func(args...)
+        result = Base.@invokelatest f(args...; kwargs...)
+        Metal.synchronize()
+        return result
     end

     try
         fetch(task)
     catch err
-        @static if VERSION >= v"1.1"
-            stk = Base.catch_stack(task)
-            err, frames = stk[1]
-            rethrow(CapturedException(err, frames))
-        else
-            rethrow(task.result)
-        end
+        stk = current_exceptions(task)
+        err, frames = stk[1]
+        rethrow(CapturedException(err, frames))
     end
 end

 function Base.show(io::IO, proc::MtlArrayDeviceProc)
-    print(io, "MtlArrayDeviceProc on worker $(proc.owner), device ($(something(_get_metal_device(proc)).name))")
+    print(io, "MtlArrayDeviceProc(worker $(proc.owner), device $(something(_get_metal_device(proc)).name))")
 end

-processor(::Val{:Metal}) = MtlArrayDeviceProc
-cancompute(::Val{:Metal}) = length(Metal.devices()) >= 1
-kernel_backend(proc::MtlArrayDeviceProc) = _get_metal_device(proc)
+DaggerGPU.processor(::Val{:Metal}) = MtlArrayDeviceProc
+DaggerGPU.cancompute(::Val{:Metal}) = Metal.functional()
+DaggerGPU.kernel_backend(proc::MtlArrayDeviceProc) = MetalBackend()
+# TODO: Switch devices
+DaggerGPU.with_device(f, proc::MtlArrayDeviceProc) = f()
+
+function Dagger.to_scope(::Val{:metal_gpu}, sc::NamedTuple)
+    worker = get(sc, :worker, 1)
+    dev_id = sc.metal_gpu
+    dev = Metal.devices()[dev_id]
+    return Dagger.ExactScope(MtlArrayDeviceProc(worker, dev.registryID))
+end
+Dagger.scope_key_precedence(::Val{:metal_gpu}) = 1

-for dev in Metal.devices()
-    Dagger.add_processor_callback!("metal_device_$(dev.registryID)") do
-        MtlArrayDeviceProc(Distributed.myid(), dev.registryID)
+function __init__()
+    for dev in Metal.devices()
+        @debug "Registering Metal GPU processor with Dagger: $dev"
+        Dagger.add_processor_callback!("metal_device_$(dev.registryID)") do
+            MtlArrayDeviceProc(myid(), dev.registryID)
+        end
     end
 end

+
 ################################################################################
 # Private functions
 ################################################################################
@@ -149,3 +150,5 @@ function _get_metal_device(proc::MtlArrayDeviceProc)
     return devices[id]
 end
 end
+
+end # module MetalExt
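The Metal extension mirrors the CUDA one, exposing a :metal_gpu scope key. An analogous hedged sketch, under the same assumption about Dagger.scope and assuming an Apple-silicon host where Metal.functional() is true:

    using Dagger, DaggerGPU, Metal

    # Pin a task to the first Metal device via the new :metal_gpu scope key.
    sc = Dagger.scope(worker=1, metal_gpu=1)
    t = Dagger.@spawn scope=sc sum(MtlArray(rand(Float32, 1024)))
    fetch(t)   # executes on the corresponding MtlArrayDeviceProc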
