diff --git a/Project.toml b/Project.toml index 3ea1895..6ab5499 100644 --- a/Project.toml +++ b/Project.toml @@ -1,42 +1,22 @@ name = "DaggerGPU" uuid = "68e73e28-2238-4d5a-bf97-e5d4aa3c4be2" authors = ["Julian P Samaroo "] -version = "0.2.0" +version = "0.2.1" [deps] -AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" -Metal = "dde4c033-4e86-420c-a63e-0dd931031962" -oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" Requires = "ae029012-a4dd-5104-9daa-d747884805df" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -[weakdeps] -AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -Metal = "dde4c033-4e86-420c-a63e-0dd931031962" -oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" - -[extensions] -CUDAExt = "CUDA" -IntelExt = "oneAPI" -MetalExt = "Metal" -ROCExt = "AMDGPU" - [compat] -AMDGPU = "0.9.4" Adapt = "1, 2, 3, 4" -CUDA = "3, 4, 5" -Dagger = "0.18.12" +Dagger = "0.18.17" KernelAbstractions = "0.9" MemPool = "0.3, 0.4" -Metal = "1.1" -oneAPI = "1" Requires = "1" julia = "1.7" diff --git a/README.md b/README.md index 5922938..51e04d6 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,12 @@ **GPU integrations for Dagger.jl** +## Deprecation Notice + +DaggerGPU's logic and extensions have been merged upstream into Dagger.jl, making DaggerGPU no longer necessary. You can now load CUDA logic with `using Dagger, CUDA`, and similarly for all other backends. Please report all issues to the [Dagger issue tracker](https://github.com/JuliaParallel/Dagger.jl/issues). + +## Original README + DaggerGPU.jl makes use of the `Dagger.Processor` infrastructure to dispatch Dagger kernels to NVIDIA, AMD, and Apple GPUs, via CUDA.jl, AMDGPU.jl, and Metal.jl respectively. Usage is simple: `add` or `dev` DaggerGPU.jl and CUDA.jl/AMDGPU.jl/Metal.jl appropriately, load it with `using DaggerGPU`, and add `DaggerGPU.CuArrayDeviceProc`/`DaggerGPU.ROCArrayProc`/`DaggerGPU.MtlArrayDeviceProc` to your scheduler or thunk options (see Dagger.jl documentation for details on how to do this). DaggerGPU.jl is still experimental, but we welcome GPU-owning users to try it out and report back on any issues or sharp edges that they encounter. When filing an issue about DaggerGPU.jl, please provide: diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl deleted file mode 100644 index 400b1dd..0000000 --- a/ext/CUDAExt.jl +++ /dev/null @@ -1,344 +0,0 @@ -module CUDAExt - -export CuArrayDeviceProc - -import Dagger, DaggerGPU, MemPool -import Dagger: CPURAMMemorySpace, Chunk, unwrap -import MemPool: DRef, poolget -import Distributed: myid, remotecall_fetch -import LinearAlgebra -using KernelAbstractions, Adapt - -const CPUProc = Union{Dagger.OSProc,Dagger.ThreadProc} - -if isdefined(Base, :get_extension) - import CUDA -else - import ..CUDA -end -import CUDA: CuDevice, CuContext, CuStream, CuArray, CUDABackend -import CUDA: devices, attribute, context, context!, stream, stream! -import CUDA: CUBLAS, CUSOLVER - -using UUIDs - -"Represents a single CUDA GPU device." -struct CuArrayDeviceProc <: Dagger.Processor - owner::Int - device::Int - device_uuid::UUID -end -Dagger.get_parent(proc::CuArrayDeviceProc) = Dagger.OSProc(proc.owner) -Dagger.root_worker_id(proc::CuArrayDeviceProc) = proc.owner -Base.show(io::IO, proc::CuArrayDeviceProc) = - print(io, "CuArrayDeviceProc(worker $(proc.owner), device $(proc.device), uuid $(proc.device_uuid))") -Dagger.short_name(proc::CuArrayDeviceProc) = "W: $(proc.owner), CUDA: $(proc.device)" -DaggerGPU.@gpuproc(CuArrayDeviceProc, CuArray) - -"Represents the memory space of a single CUDA GPU's VRAM." -struct CUDAVRAMMemorySpace <: Dagger.MemorySpace - owner::Int - device::Int - device_uuid::UUID -end -Dagger.root_worker_id(space::CUDAVRAMMemorySpace) = space.owner -function Dagger.memory_space(x::CuArray) - dev = CUDA.device(x) - device_id = dev.handle - device_uuid = CUDA.uuid(dev) - return CUDAVRAMMemorySpace(myid(), device_id, device_uuid) -end - -Dagger.memory_spaces(proc::CuArrayDeviceProc) = Set([CUDAVRAMMemorySpace(proc.owner, proc.device, proc.device_uuid)]) -Dagger.processors(space::CUDAVRAMMemorySpace) = Set([CuArrayDeviceProc(space.owner, space.device, space.device_uuid)]) - -function to_device(proc::CuArrayDeviceProc) - @assert Dagger.root_worker_id(proc) == myid() - return DEVICES[proc.device] -end -function to_context(proc::CuArrayDeviceProc) - @assert Dagger.root_worker_id(proc) == myid() - return CONTEXTS[proc.device] -end -to_context(handle::Integer) = CONTEXTS[handle] -to_context(dev::CuDevice) = to_context(dev.handle) - -function with_context!(handle::Integer) - context!(CONTEXTS[handle]) - stream!(STREAMS[handle]) -end -function with_context!(proc::CuArrayDeviceProc) - @assert Dagger.root_worker_id(proc) == myid() - with_context!(proc.device) -end -function with_context!(space::CUDAVRAMMemorySpace) - @assert Dagger.root_worker_id(space) == myid() - with_context!(space.device) -end -function with_context(f, x) - old_ctx = context() - old_stream = stream() - - with_context!(x) - try - f() - finally - context!(old_ctx) - stream!(old_stream) - end -end - -function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace}) - if Dagger.root_worker_id(x) == myid() - with_context(CUDA.synchronize, x) - else - # Do nothing, as we have received our value over a serialization - # boundary, which should synchronize for us - end -end - -# Allocations -Dagger.allocate_array_func(::CuArrayDeviceProc, ::typeof(rand)) = CUDA.rand -Dagger.allocate_array_func(::CuArrayDeviceProc, ::typeof(randn)) = CUDA.randn -Dagger.allocate_array_func(::CuArrayDeviceProc, ::typeof(ones)) = CUDA.ones -Dagger.allocate_array_func(::CuArrayDeviceProc, ::typeof(zeros)) = CUDA.zeros -struct AllocateUndef{S} end -(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = CuArray{S,N}(undef, dims) -Dagger.allocate_array_func(::CuArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}() - -# In-place -# N.B. These methods assume that later operations will implicitly or -# explicitly synchronize with their associated stream -function Dagger.move!(to_space::Dagger.CPURAMMemorySpace, from_space::CUDAVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - if Dagger.root_worker_id(from_space) == myid() - sync_with_context(from_space) - with_context!(from_space) - end - copyto!(to, from) - # N.B. DtoH will synchronize - return -end -function Dagger.move!(to_space::CUDAVRAMMemorySpace, from_space::Dagger.CPURAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - with_context!(to_space) - copyto!(to, from) - return -end -function Dagger.move!(to_space::CUDAVRAMMemorySpace, from_space::CUDAVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - sync_with_context(from_space) - with_context!(to_space) - copyto!(to, from) - return -end - -# Out-of-place HtoD -function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x) - with_context(to_proc) do - arr = adapt(CuArray, x) - CUDA.synchronize() - return arr - end -end -function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::Chunk) - from_w = Dagger.root_worker_id(from_proc) - to_w = Dagger.root_worker_id(to_proc) - @assert myid() == to_w - cpu_data = remotecall_fetch(unwrap, from_w, x) - with_context(to_proc) do - arr = adapt(CuArray, cpu_data) - CUDA.synchronize() - return arr - end -end -function Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::CuArray) - if CUDA.device(x) == to_device(to_proc) - return x - end - with_context(to_proc) do - _x = similar(x) - copyto!(_x, x) - CUDA.synchronize() - return _x - end -end - -# Out-of-place DtoH -function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x) - with_context(from_proc) do - CUDA.synchronize() - _x = adapt(Array, x) - CUDA.synchronize() - return _x - end -end -function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x::Chunk) - from_w = Dagger.root_worker_id(from_proc) - to_w = Dagger.root_worker_id(to_proc) - @assert myid() == to_w - remotecall_fetch(from_w, x) do x - arr = unwrap(x) - return Dagger.move(from_proc, to_proc, arr) - end -end -function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CPUProc, x::CuArray{T,N}) where {T,N} - with_context(from_proc) do - CUDA.synchronize() - _x = Array{T,N}(undef, size(x)) - copyto!(_x, x) - CUDA.synchronize() - return _x - end -end - -# Out-of-place DtoD -function Dagger.move(from_proc::CuArrayDeviceProc, to_proc::CuArrayDeviceProc, x::Dagger.Chunk{T}) where T<:CuArray - if from_proc == to_proc - # Same process and GPU, no change - arr = unwrap(x) - with_context(CUDA.synchronize, from_proc) - return arr - elseif Dagger.root_worker_id(from_proc) == Dagger.root_worker_id(to_proc) - # Same process but different GPUs, use DtoD copy - from_arr = unwrap(x) - with_context(CUDA.synchronize, from_proc) - return with_context(to_proc) do - to_arr = similar(from_arr) - copyto!(to_arr, from_arr) - CUDA.synchronize() - return to_arr - end - elseif Dagger.system_uuid(from_proc.owner) == Dagger.system_uuid(to_proc.owner) && from_proc.device_uuid == to_proc.device_uuid - # Same node, we can use IPC - ipc_handle, eT, shape = remotecall_fetch(from_proc.owner, x) do x - arr = unwrap(x) - ipc_handle_ref = Ref{CUDA.CUipcMemHandle}() - GC.@preserve arr begin - CUDA.cuIpcGetMemHandle(ipc_handle_ref, pointer(arr)) - end - (ipc_handle_ref[], eltype(arr), size(arr)) - end - r_ptr = Ref{CUDA.CUdeviceptr}() - CUDA.device!(from_proc.device) do - CUDA.cuIpcOpenMemHandle(r_ptr, ipc_handle, CUDA.CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) - end - ptr = Base.unsafe_convert(CUDA.CuPtr{eT}, r_ptr[]) - arr = unsafe_wrap(CuArray, ptr, shape; own=false) - finalizer(arr) do arr - CUDA.cuIpcCloseMemHandle(pointer(arr)) - end - if from_proc.device_uuid != to_proc.device_uuid - return CUDA.device!(to_proc.device) do - to_arr = similar(arr) - copyto!(to_arr, arr) - to_arr - end - else - return arr - end - else - # Different node, use DtoH, serialization, HtoD - return CuArray(remotecall_fetch(from_proc.owner, x) do x - Array(unwrap(x)) - end) - end -end - -# Adapt generic functions -Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::Function) = x -Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, x::Chunk{T}) where {T<:Function} = - Dagger.move(from_proc, to_proc, fetch(x)) - -# Adapt BLAS/LAPACK functions -import LinearAlgebra: BLAS, LAPACK -for lib in [BLAS, LAPACK] - for name in names(lib; all=true) - name == nameof(lib) && continue - startswith(string(name), '#') && continue - endswith(string(name), '!') || continue - - for culib in [CUBLAS, CUSOLVER] - if name in names(culib; all=true) - fn = getproperty(lib, name) - cufn = getproperty(culib, name) - @eval Dagger.move(from_proc::CPUProc, to_proc::CuArrayDeviceProc, ::$(typeof(fn))) = $cufn - end - end - end -end - -# Task execution -function Dagger.execute!(proc::CuArrayDeviceProc, f, args...; kwargs...) - @nospecialize f args kwargs - tls = Dagger.get_tls() - task = Threads.@spawn begin - Dagger.set_tls!(tls) - with_context!(proc) - result = Base.@invokelatest f(args...; kwargs...) - # N.B. Synchronization must be done when accessing result or args - return result - end - - try - fetch(task) - catch err - stk = current_exceptions(task) - err, frames = stk[1] - rethrow(CapturedException(err, frames)) - end -end - -DaggerGPU.processor(::Val{:CUDA}) = CuArrayDeviceProc -DaggerGPU.cancompute(::Val{:CUDA}) = CUDA.has_cuda() -DaggerGPU.kernel_backend(::CuArrayDeviceProc) = CUDABackend() -DaggerGPU.with_device(f, proc::CuArrayDeviceProc) = - CUDA.device!(f, proc.device) - -Dagger.to_scope(::Val{:cuda_gpu}, sc::NamedTuple) = - Dagger.to_scope(Val{:cuda_gpus}(), merge(sc, (;cuda_gpus=[sc.cuda_gpu]))) -Dagger.scope_key_precedence(::Val{:cuda_gpu}) = 1 -function Dagger.to_scope(::Val{:cuda_gpus}, sc::NamedTuple) - if haskey(sc, :worker) - workers = Int[sc.worker] - elseif haskey(sc, :workers) && sc.workers != Colon() - workers = sc.workers - else - workers = map(gproc->gproc.pid, Dagger.procs(Dagger.Sch.eager_context())) - end - scopes = Dagger.ExactScope[] - dev_ids = sc.cuda_gpus - for worker in workers - procs = Dagger.get_processors(Dagger.OSProc(worker)) - for proc in procs - proc isa CuArrayDeviceProc || continue - if dev_ids == Colon() || proc.device+1 in dev_ids - scope = Dagger.ExactScope(proc) - push!(scopes, scope) - end - end - end - return Dagger.UnionScope(scopes) -end -Dagger.scope_key_precedence(::Val{:cuda_gpus}) = 1 - -const DEVICES = Dict{Int, CuDevice}() -const CONTEXTS = Dict{Int, CuContext}() -const STREAMS = Dict{Int, CuStream}() - -function __init__() - if CUDA.has_cuda() - for dev in CUDA.devices() - @debug "Registering CUDA GPU processor with Dagger: $dev" - Dagger.add_processor_callback!("cuarray_device_$(dev.handle)") do - proc = CuArrayDeviceProc(myid(), dev.handle, CUDA.uuid(dev)) - DEVICES[dev.handle] = dev - ctx = context(dev) - CONTEXTS[dev.handle] = ctx - context!(ctx) do - STREAMS[dev.handle] = stream() - end - return proc - end - end - end -end - -end # module CUDAExt diff --git a/ext/IntelExt.jl b/ext/IntelExt.jl deleted file mode 100644 index 74f3aa3..0000000 --- a/ext/IntelExt.jl +++ /dev/null @@ -1,312 +0,0 @@ -module IntelExt - -export oneArrayDeviceProc - -import Dagger, DaggerGPU, MemPool -import Dagger: CPURAMMemorySpace, Chunk, unwrap -import MemPool: DRef, poolget -import Distributed: myid, remotecall_fetch -import LinearAlgebra -using KernelAbstractions, Adapt - -const CPUProc = Union{Dagger.OSProc,Dagger.ThreadProc} - -if isdefined(Base, :get_extension) - import oneAPI -else - import ..oneAPI -end -import oneAPI: ZeDevice, ZeDriver, ZeContext, oneArray, oneAPIBackend -import oneAPI: driver, driver!, device, device!, context, context! -#import oneAPI: CUBLAS, CUSOLVER - -using UUIDs - -"Represents a single Intel GPU device." -struct oneArrayDeviceProc <: Dagger.Processor - owner::Int - device_id::Int -end -Dagger.get_parent(proc::oneArrayDeviceProc) = Dagger.OSProc(proc.owner) -Dagger.root_worker_id(proc::oneArrayDeviceProc) = proc.owner -Base.show(io::IO, proc::oneArrayDeviceProc) = - print(io, "oneArrayDeviceProc(worker $(proc.owner), device $(proc.device_id))") -Dagger.short_name(proc::oneArrayDeviceProc) = "W: $(proc.owner), oneAPI: $(proc.device)" -DaggerGPU.@gpuproc(oneArrayDeviceProc, oneArray) - -"Represents the memory space of a single Intel GPU's VRAM." -struct IntelVRAMMemorySpace <: Dagger.MemorySpace - owner::Int - device_id::Int -end -Dagger.root_worker_id(space::IntelVRAMMemorySpace) = space.owner -function Dagger.memory_space(x::oneArray) - dev = oneAPI.device(x) - device_id = _device_id(dev) - return IntelVRAMMemorySpace(myid(), device_id) -end -_device_id(dev::ZeDevice) = findfirst(other_dev->other_dev === dev, collect(oneAPI.devices())) - -Dagger.memory_spaces(proc::oneArrayDeviceProc) = Set([IntelVRAMMemorySpace(proc.owner, proc.device_id)]) -Dagger.processors(space::IntelVRAMMemorySpace) = Set([oneArrayDeviceProc(space.owner, space.device_id)]) - -function to_device(proc::oneArrayDeviceProc) - @assert Dagger.root_worker_id(proc) == myid() - return DEVICES[proc.device_id] -end - -function with_context!(device_id::Integer) - driver!(DRIVERS[device_id]) - device!(DEVICES[device_id]) - context!(CONTEXTS[device_id]) -end -function with_context!(proc::oneArrayDeviceProc) - @assert Dagger.root_worker_id(proc) == myid() - with_context!(proc.device_id) -end -function with_context!(space::IntelVRAMMemorySpace) - @assert Dagger.root_worker_id(space) == myid() - with_context!(space.device_id) -end -function with_context(f, x) - old_drv = driver() - old_dev = device() - old_ctx = context() - - with_context!(x) - try - f() - finally - driver!(old_drv) - device!(old_dev) - context!(old_ctx) - end -end - -function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace}) - if Dagger.root_worker_id(x) == myid() - with_context(oneAPI.synchronize, x) - else - # Do nothing, as we have received our value over a serialization - # boundary, which should synchronize for us - end -end - -# Allocations -Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(rand)) = oneAPI.rand -Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(randn)) = oneAPI.randn -Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(ones)) = oneAPI.ones -Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(zeros)) = oneAPI.zeros -struct AllocateUndef{S} end -(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = oneArray{S,N}(undef, dims) -Dagger.allocate_array_func(::oneArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}() - -# In-place -# N.B. These methods assume that later operations will implicitly or -# explicitly synchronize with their associated stream -function Dagger.move!(to_space::Dagger.CPURAMMemorySpace, from_space::IntelVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - if Dagger.root_worker_id(from_space) == myid() - sync_with_context(from_space) - with_context!(from_space) - end - copyto!(to, from) - # N.B. DtoH will synchronize - return -end -function Dagger.move!(to_space::IntelVRAMMemorySpace, from_space::Dagger.CPURAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - with_context!(to_space) - copyto!(to, from) - return -end -function Dagger.move!(to_space::IntelVRAMMemorySpace, from_space::IntelVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - sync_with_context(from_space) - with_context!(to_space) - copyto!(to, from) - return -end - -# Out-of-place HtoD -function Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x) - with_context(to_proc) do - arr = adapt(oneArray, x) - oneAPI.synchronize() - return arr - end -end -function Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::Chunk) - from_w = Dagger.root_worker_id(from_proc) - to_w = Dagger.root_worker_id(to_proc) - @assert myid() == to_w - cpu_data = remotecall_fetch(unwrap, from_w, x) - with_context(to_proc) do - arr = adapt(oneArray, cpu_data) - oneAPI.synchronize() - return arr - end -end -function Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::oneArray) - if oneAPI.device(x) == to_device(to_proc) - return x - end - with_context(to_proc) do - _x = similar(x) - copyto!(_x, x) - oneAPI.synchronize() - return _x - end -end - -# Out-of-place DtoH -function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::CPUProc, x) - with_context(from_proc) do - oneAPI.synchronize() - _x = adapt(Array, x) - oneAPI.synchronize() - return _x - end -end -function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::CPUProc, x::Chunk) - from_w = Dagger.root_worker_id(from_proc) - to_w = Dagger.root_worker_id(to_proc) - @assert myid() == to_w - remotecall_fetch(from_w, x) do x - arr = unwrap(x) - return Dagger.move(from_proc, to_proc, arr) - end -end -function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::CPUProc, x::oneArray{T,N}) where {T,N} - with_context(from_proc) do - oneAPI.synchronize() - _x = Array{T,N}(undef, size(x)) - copyto!(_x, x) - oneAPI.synchronize() - return _x - end -end - -# Out-of-place DtoD -function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::oneArrayDeviceProc, x::Dagger.Chunk{T}) where T<:oneArray - if from_proc == to_proc - # Same process and GPU, no change - arr = unwrap(x) - with_context(oneAPI.synchronize, from_proc) - return arr - elseif Dagger.root_worker_id(from_proc) == Dagger.root_worker_id(to_proc) - # Same process but different GPUs, use DtoD copy - from_arr = unwrap(x) - with_context(oneAPI.synchronize, from_proc) - return with_context(to_proc) do - to_arr = similar(from_arr) - copyto!(to_arr, from_arr) - oneAPI.synchronize() - return to_arr - end - else - # Different node, use DtoH, serialization, HtoD - return oneArray(remotecall_fetch(from_proc.owner, x) do x - Array(unwrap(x)) - end) - end -end - -# Adapt generic functions -Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::Function) = x -Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::Chunk{T}) where {T<:Function} = - Dagger.move(from_proc, to_proc, fetch(x)) - -#= FIXME: Adapt BLAS/LAPACK functions -import LinearAlgebra: BLAS, LAPACK -for lib in [BLAS, LAPACK] - for name in names(lib; all=true) - name == nameof(lib) && continue - startswith(string(name), '#') && continue - endswith(string(name), '!') || continue - - for culib in [CUBLAS, CUSOLVER] - if name in names(culib; all=true) - fn = getproperty(lib, name) - cufn = getproperty(culib, name) - @eval Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, ::$(typeof(fn))) = $cufn - end - end - end -end -=# - -# Task execution -function Dagger.execute!(proc::oneArrayDeviceProc, f, args...; kwargs...) - @nospecialize f args kwargs - tls = Dagger.get_tls() - task = Threads.@spawn begin - Dagger.set_tls!(tls) - with_context!(proc) - result = Base.@invokelatest f(args...; kwargs...) - # N.B. Synchronization must be done when accessing result or args - return result - end - - try - fetch(task) - catch err - stk = current_exceptions(task) - err, frames = stk[1] - rethrow(CapturedException(err, frames)) - end -end - -DaggerGPU.processor(::Val{:oneAPI}) = oneArrayDeviceProc -DaggerGPU.cancompute(::Val{:oneAPI}) = oneAPI.functional() -DaggerGPU.kernel_backend(::oneArrayDeviceProc) = oneAPIBackend() -DaggerGPU.with_device(f, proc::oneArrayDeviceProc) = - device!(f, proc.device_id) - -Dagger.to_scope(::Val{:intel_gpu}, sc::NamedTuple) = - Dagger.to_scope(Val{:intel_gpus}(), merge(sc, (;intel_gpus=[sc.intel_gpu]))) -Dagger.scope_key_precedence(::Val{:intel_gpu}) = 1 -function Dagger.to_scope(::Val{:intel_gpus}, sc::NamedTuple) - if haskey(sc, :worker) - workers = Int[sc.worker] - elseif haskey(sc, :workers) && sc.workers != Colon() - workers = sc.workers - else - workers = map(gproc->gproc.pid, Dagger.procs(Dagger.Sch.eager_context())) - end - scopes = Dagger.ExactScope[] - dev_ids = sc.intel_gpus - for worker in workers - procs = Dagger.get_processors(Dagger.OSProc(worker)) - for proc in procs - proc isa oneArrayDeviceProc || continue - if dev_ids == Colon() || proc.device_id in dev_ids - scope = Dagger.ExactScope(proc) - push!(scopes, scope) - end - end - end - return Dagger.UnionScope(scopes) -end -Dagger.scope_key_precedence(::Val{:intel_gpus}) = 1 - -const DEVICES = Dict{Int, ZeDevice}() -const DRIVERS = Dict{Int, ZeDriver}() -const CONTEXTS = Dict{Int, ZeContext}() - -function __init__() - if oneAPI.functional() - for (device_id, dev) in enumerate(oneAPI.devices()) - @debug "Registering Intel GPU processor with Dagger: $dev" - Dagger.add_processor_callback!("zearray_device_$(device_id)") do - proc = oneArrayDeviceProc(myid(), device_id) - DEVICES[device_id] = dev - driver!(dev.driver) - DRIVERS[device_id] = dev.driver - device!(dev) - ctx = ZeContext(dev.driver) - CONTEXTS[device_id] = ctx - return proc - end - end - end -end - -end # module IntelExt diff --git a/ext/MetalExt.jl b/ext/MetalExt.jl deleted file mode 100644 index e4580c6..0000000 --- a/ext/MetalExt.jl +++ /dev/null @@ -1,377 +0,0 @@ -module MetalExt - -export MtlArrayDeviceProc - -import Dagger, DaggerGPU, MemPool -import Dagger: CPURAMMemorySpace, Chunk, unwrap -import MemPool: DRef, poolget -import Distributed: myid, remotecall_fetch -import LinearAlgebra -using KernelAbstractions, Adapt - -const CPUProc = Union{Dagger.OSProc,Dagger.ThreadProc} - -if isdefined(Base, :get_extension) - import Metal -else - import ..Metal -end -import Metal: MtlArray, MetalBackend -# FIXME: import Metal: MTLBLAS, MTLSOLVER -const MtlDevice = Metal.MTL.MTLDeviceInstance -const MtlStream = Metal.MTL.MTLCommandQueue - -struct MtlArrayDeviceProc <: Dagger.Processor - owner::Int - device_id::UInt64 -end - -Dagger.get_parent(proc::MtlArrayDeviceProc) = Dagger.OSProc(proc.owner) -Dagger.root_worker_id(proc::MtlArrayDeviceProc) = proc.owner -Dagger.short_name(proc::MtlArrayDeviceProc) = "W: $(proc.owner), Metal: $(proc.device_id)" -DaggerGPU.@gpuproc(MtlArrayDeviceProc, MtlArray) - -"Represents the memory space of a single Metal GPU's VRAM." -struct MetalVRAMMemorySpace <: Dagger.MemorySpace - owner::Int - device_id::Int -end -Dagger.root_worker_id(space::MetalVRAMMemorySpace) = space.owner -function Dagger.memory_space(x::MtlArray) - dev = Metal.device(x) - device_id = _device_id(dev) - return MetalVRAMMemorySpace(myid(), device_id) -end -_device_id(dev::MtlDevice) = findfirst(other_dev->other_dev === dev, Metal.devices()) - -Dagger.memory_spaces(proc::MtlArrayDeviceProc) = Set([MetalVRAMMemorySpace(proc.owner, proc.device_id)]) -Dagger.processors(space::MetalVRAMMemorySpace) = Set([MtlArrayDeviceProc(space.owner, space.device_id)]) - -function to_device(proc::MtlArrayDeviceProc) - @assert Dagger.root_worker_id(proc) == myid() - return DEVICES[proc.device_id] -end -function to_context(proc::MtlArrayDeviceProc) - @assert Dagger.root_worker_id(proc) == myid() - return Metal.global_queue() #CONTEXTS[proc.device] -end -to_context(device_id::Integer) = Metal.global_queue(DEVICES[device_id]) -to_context(dev::MtlDevice) = to_context(_device_id(dev)) - -function with_context!(device_id::Integer) -end -function with_context!(proc::MtlArrayDeviceProc) - @assert Dagger.root_worker_id(proc) == myid() -end -function with_context!(space::MetalVRAMMemorySpace) - @assert Dagger.root_worker_id(space) == myid() -end -function with_context(f, x) - with_context!(x) - return f() -end - -function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace}) - if Dagger.root_worker_id(x) == myid() - with_context(Metal.synchronize, x) - else - # Do nothing, as we have received our value over a serialization - # boundary, which should synchronize for us - end -end - -# Allocations -Dagger.allocate_array_func(::MtlArrayDeviceProc, ::typeof(rand)) = Metal.rand -Dagger.allocate_array_func(::MtlArrayDeviceProc, ::typeof(randn)) = Metal.randn -Dagger.allocate_array_func(::MtlArrayDeviceProc, ::typeof(ones)) = Metal.ones -Dagger.allocate_array_func(::MtlArrayDeviceProc, ::typeof(zeros)) = Metal.zeros -struct AllocateUndef{S} end -(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = MtlArray{S,N}(undef, dims) -Dagger.allocate_array_func(::MtlArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}() - -# In-place -# N.B. These methods assume that later operations will implicitly or -# explicitly synchronize with their associated stream -function Dagger.move!(to_space::Dagger.CPURAMMemorySpace, from_space::MetalVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - if Dagger.root_worker_id(from_space) == myid() - sync_with_context(from_space) - with_context!(from_space) - end - copyto!(to, from) - # N.B. DtoH will synchronize - return -end -function Dagger.move!(to_space::MetalVRAMMemorySpace, from_space::Dagger.CPURAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - with_context!(to_space) - copyto!(to, from) - return -end -function Dagger.move!(to_space::MetalVRAMMemorySpace, from_space::MetalVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - sync_with_context(from_space) - with_context!(to_space) - copyto!(to, from) - return -end - -# Out-of-place HtoD -function Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, x) - with_context(to_proc) do - arr = adapt(MtlArray, x) - Metal.synchronize() - return arr - end -end -function Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, x::Chunk) - from_w = Dagger.root_worker_id(from_proc) - to_w = Dagger.root_worker_id(to_proc) - @assert myid() == to_w - cpu_data = remotecall_fetch(unwrap, from_w, x) - with_context(to_proc) do - arr = adapt(MtlArray, cpu_data) - Metal.synchronize() - return arr - end -end -function Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, x::MtlArray) - if Metal.device(x) == to_device(to_proc) - return x - end - with_context(to_proc) do - _x = similar(x) - copyto!(_x, x) - Metal.synchronize() - return _x - end -end - -# Out-of-place DtoH -function Dagger.move(from_proc::MtlArrayDeviceProc, to_proc::CPUProc, x) - with_context(from_proc) do - Metal.synchronize() - _x = adapt(Array, x) - Metal.synchronize() - return _x - end -end -function Dagger.move(from_proc::MtlArrayDeviceProc, to_proc::CPUProc, x::Chunk) - from_w = Dagger.root_worker_id(from_proc) - to_w = Dagger.root_worker_id(to_proc) - @assert myid() == to_w - remotecall_fetch(from_w, x) do x - arr = unwrap(x) - return Dagger.move(from_proc, to_proc, arr) - end -end -function Dagger.move(from_proc::MtlArrayDeviceProc, to_proc::CPUProc, x::MtlArray{T,N}) where {T,N} - with_context(from_proc) do - Metal.synchronize() - _x = Array{T,N}(undef, size(x)) - copyto!(_x, x) - Metal.synchronize() - return _x - end -end - -# Out-of-place DtoD -function Dagger.move(from_proc::MtlArrayDeviceProc, to_proc::MtlArrayDeviceProc, x::Dagger.Chunk{T}) where T<:MtlArray - if from_proc == to_proc - # Same process and GPU, no change - arr = unwrap(x) - with_context(Metal.synchronize, from_proc) - return arr - # FIXME: elseif Dagger.root_worker_id(from_proc) == Dagger.root_worker_id(to_proc) - else - # Different node, use DtoH, serialization, HtoD - return MtlArray(remotecall_fetch(from_proc.owner, x) do x - Array(unwrap(x)) - end) - end -end - -# Adapt generic functions -Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, x::Function) = x -Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, x::Chunk{T}) where {T<:Function} = - Dagger.move(from_proc, to_proc, fetch(x)) - -#= FIXME: Adapt BLAS/LAPACK functions -import LinearAlgebra: BLAS, LAPACK -for lib in [BLAS, LAPACK] - for name in names(lib; all=true) - name == nameof(lib) && continue - startswith(string(name), '#') && continue - endswith(string(name), '!') || continue - - for culib in [MTLBLAS, MTLSOLVER] - if name in names(culib; all=true) - fn = getproperty(lib, name) - cufn = getproperty(culib, name) - @eval Dagger.move(from_proc::CPUProc, to_proc::MtlArrayDeviceProc, ::$(typeof(fn))) = $cufn - end - end - end -end -=# - -function DaggerGPU.move_optimized( - from_proc::CPUProc, - to_proc::MtlArrayDeviceProc, - x::Array -) - # FIXME - return nothing - - # If we have unified memory, we can try casting the `Array` to `MtlArray`. - device = _get_metal_device(to_proc) - - if (device !== nothing) && device.hasUnifiedMemory - marray = _cast_array_to_mtlarray(x, device) - marray !== nothing && return marray - end - - return nothing -end -function DaggerGPU.move_optimized( - from_proc::MtlArrayDeviceProc, - to_proc::CPUProc, - x::Array -) - # FIXME - return nothing - - # If we have unified memory, we can just cast the `MtlArray` to an `Array`. - device = _get_metal_device(from_proc) - - if (device !== nothing) && device.hasUnifiedMemory - return unsafe_wrap(Array{T}, x, size(x)) - end - - return nothing -end - -# Task execution -function Dagger.execute!(proc::MtlArrayDeviceProc, f, args...; kwargs...) - @nospecialize f args kwargs - tls = Dagger.get_tls() - task = Threads.@spawn begin - Dagger.set_tls!(tls) - with_context!(proc) - result = Base.@invokelatest f(args...; kwargs...) - # N.B. Synchronization must be done when accessing result or args - return result - end - - try - fetch(task) - catch err - stk = current_exceptions(task) - err, frames = stk[1] - rethrow(CapturedException(err, frames)) - end -end - -function Base.show(io::IO, proc::MtlArrayDeviceProc) - print(io, "MtlArrayDeviceProc(worker $(proc.owner), device $(something(_get_metal_device(proc)).name))") -end - -DaggerGPU.processor(::Val{:Metal}) = MtlArrayDeviceProc -DaggerGPU.cancompute(::Val{:Metal}) = Metal.functional() -DaggerGPU.kernel_backend(proc::MtlArrayDeviceProc) = MetalBackend() -# TODO: Switch devices -DaggerGPU.with_device(f, proc::MtlArrayDeviceProc) = f() - -Dagger.to_scope(::Val{:metal_gpu}, sc::NamedTuple) = - Dagger.to_scope(Val{:metal_gpus}(), merge(sc, (;metal_gpus=[sc.metal_gpu]))) -Dagger.scope_key_precedence(::Val{:metal_gpu}) = 1 -function Dagger.to_scope(::Val{:metal_gpus}, sc::NamedTuple) - if haskey(sc, :worker) - workers = Int[sc.worker] - elseif haskey(sc, :workers) && sc.workers != Colon() - workers = sc.workers - else - workers = map(gproc->gproc.pid, Dagger.procs(Dagger.Sch.eager_context())) - end - scopes = Dagger.ExactScope[] - dev_ids = sc.metal_gpus - for worker in workers - procs = Dagger.get_processors(Dagger.OSProc(worker)) - for proc in procs - proc isa MtlArrayDeviceProc || continue - if dev_ids == Colon() || proc.device_id in dev_ids - scope = Dagger.ExactScope(proc) - push!(scopes, scope) - end - end - end - return Dagger.UnionScope(scopes) -end -Dagger.scope_key_precedence(::Val{:metal_gpus}) = 1 - -const DEVICES = Dict{Int, MtlDevice}() -#const CONTEXTS = Dict{Int, MtlContext}() -#const STREAMS = Dict{Int, MtlStream}() - -function __init__() - if Metal.functional() - for (dev_id, dev) in enumerate(Metal.devices()) - @debug "Registering Metal GPU processor with Dagger: $dev" - # FIXME: We only get a Ptr now - Dagger.add_processor_callback!("metal_device_$(dev_id)") do - proc = MtlArrayDeviceProc(myid(), dev_id) - DEVICES[dev_id] = dev - #= - ctx = context(dev) - CONTEXTS[dev.device_id] = ctx - context!(ctx) do - STREAMS[dev.device_id] = stream() - end - =# - return proc - end - end - end -end - - -################################################################################ -# Private functions -################################################################################ - -# Try casting the array `x` to an `MtlArray`. If the casting is not possible, -# return `nothing`. -function _cast_array_to_mtlarray(x::Array{T,N}, device::MtlDevice) where {T,N} - # Try creating the buffer without copying. - dims = size(x) - nbytes_array = prod(dims) * sizeof(T) - pagesize = ccall(:getpagesize, Cint, ()) - num_pages = div(nbytes_array, pagesize, RoundUp) - nbytes = num_pages * pagesize - - pbuf = Metal.MTL.mtDeviceNewBufferWithBytesNoCopy( - device, - pointer(x), - nbytes, - Metal.Shared | Metal.MTL.DefaultTracking | Metal.MTL.DefaultCPUCache - ) - - if pbuf != C_NULL - buf = MtlBuffer(pbuf) - marray = MtlArray{T,N}(buf, dims) - return marray - end - - # If we reached here, the conversion was not possible. - return nothing -end - -# Return the Metal device handler given the ID recorded in `proc`. -function _get_metal_device(proc::MtlArrayDeviceProc) - devices = Metal.devices() - - if devices === nothing - return nothing - else - return devices[proc.device_id] - end -end - -end # module MetalExt diff --git a/ext/ROCExt.jl b/ext/ROCExt.jl deleted file mode 100644 index 1f66589..0000000 --- a/ext/ROCExt.jl +++ /dev/null @@ -1,315 +0,0 @@ -module ROCExt - -export ROCArrayDeviceProc - -import Dagger, DaggerGPU, MemPool -import Dagger: CPURAMMemorySpace, Chunk, unwrap -import MemPool: DRef, poolget -import Distributed: myid, remotecall_fetch -import LinearAlgebra -using KernelAbstractions, Adapt - -const CPUProc = Union{Dagger.OSProc,Dagger.ThreadProc} - -if isdefined(Base, :get_extension) - import AMDGPU -else - import ..AMDGPU -end -import AMDGPU: HIPDevice, HIPContext, HIPStream, ROCArray, ROCBackend -import AMDGPU: devices, context, context!, stream, stream! -import AMDGPU: rocBLAS, rocSOLVER - -struct ROCArrayDeviceProc <: Dagger.Processor - owner::Int - device_id::Int -end -Dagger.get_parent(proc::ROCArrayDeviceProc) = Dagger.OSProc(proc.owner) -Dagger.root_worker_id(proc::ROCArrayDeviceProc) = proc.owner -Base.show(io::IO, proc::ROCArrayDeviceProc) = - print(io, "ROCArrayDeviceProc(worker $(proc.owner), device $(proc.device_id))") -Dagger.short_name(proc::ROCArrayDeviceProc) = "W: $(proc.owner), ROCm: $(proc.device_id)" -DaggerGPU.@gpuproc(ROCArrayDeviceProc, ROCArray) - -"Represents the memory space of a single ROCm GPU's VRAM." -struct ROCVRAMMemorySpace <: Dagger.MemorySpace - owner::Int - device_id::Int -end -Dagger.root_worker_id(space::ROCVRAMMemorySpace) = space.owner -Dagger.memory_space(x::ROCArray) = - ROCVRAMMemorySpace(myid(), AMDGPU.device(x).device_id) - -Dagger.memory_spaces(proc::ROCArrayDeviceProc) = Set([ROCVRAMMemorySpace(proc.owner, proc.device_id)]) -Dagger.processors(space::ROCVRAMMemorySpace) = Set([ROCArrayDeviceProc(space.owner, space.device_id)]) - -function to_device(proc::ROCArrayDeviceProc) - @assert Dagger.root_worker_id(proc) == myid() - return DEVICES[proc.device_id] -end -function to_context(proc::ROCArrayDeviceProc) - @assert Dagger.root_worker_id(proc) == myid() - return CONTEXTS[proc.device_id] -end -to_context(handle::Integer) = CONTEXTS[handle] -to_context(dev::HIPDevice) = to_context(dev.device_id) - -function with_context!(handle::Integer) - context!(CONTEXTS[handle]) - AMDGPU.device!(DEVICES[handle]) - stream!(STREAMS[handle]) -end -function with_context!(proc::ROCArrayDeviceProc) - @assert Dagger.root_worker_id(proc) == myid() - with_context!(proc.device_id) -end -function with_context!(space::ROCVRAMMemorySpace) - @assert Dagger.root_worker_id(space) == myid() - with_context!(space.device_id) -end -function with_context(f, x) - old_ctx = context() - old_device = AMDGPU.device() - old_stream = stream() - - with_context!(x) - try - f() - finally - context!(old_ctx) - AMDGPU.device!(old_device) - stream!(old_stream) - end -end - -function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace}) - if Dagger.root_worker_id(x) == myid() - with_context(AMDGPU.synchronize, x) - else - # Do nothing, as we have received our value over a serialization - # boundary, which should synchronize for us - end -end - -# Allocations -# FIXME: Avoids some segfaults in rocRAND -fake_rand(::Type{T}, dims::NTuple{N}) where {T,N} = ROCArray(rand(T, dims)) -fake_randn(::Type{T}, dims::NTuple{N}) where {T,N} = ROCArray(randn(T, dims)) -Dagger.allocate_array_func(::ROCArrayDeviceProc, ::typeof(rand)) = fake_rand -Dagger.allocate_array_func(::ROCArrayDeviceProc, ::typeof(randn)) = fake_randn -Dagger.allocate_array_func(::ROCArrayDeviceProc, ::typeof(ones)) = AMDGPU.ones -Dagger.allocate_array_func(::ROCArrayDeviceProc, ::typeof(zeros)) = AMDGPU.zeros -struct AllocateUndef{S} end -(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = ROCArray{S,N}(undef, dims) -Dagger.allocate_array_func(::ROCArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}() - -# In-place -# N.B. These methods assume that later operations will implicitly or -# explicitly synchronize with their associated stream -function Dagger.move!(to_space::Dagger.CPURAMMemorySpace, from_space::ROCVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - if Dagger.root_worker_id(from_space) == myid() - sync_with_context(from_space) - with_context!(from_space) - end - copyto!(to, from) - # N.B. DtoH will synchronize - return -end -function Dagger.move!(to_space::ROCVRAMMemorySpace, from_space::Dagger.CPURAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - with_context!(to_space) - copyto!(to, from) - return -end -function Dagger.move!(to_space::ROCVRAMMemorySpace, from_space::ROCVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N} - sync_with_context(from_space) - with_context!(to_space) - copyto!(to, from) - return -end - -# Out-of-place HtoD -function Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, x) - with_context(to_proc) do - arr = adapt(ROCArray, x) - AMDGPU.synchronize() - return arr - end -end -function Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, x::Chunk) - from_w = Dagger.root_worker_id(from_proc) - to_w = Dagger.root_worker_id(to_proc) - @assert myid() == to_w - cpu_data = remotecall_fetch(unwrap, from_w, x) - with_context(to_proc) do - arr = adapt(ROCArray, cpu_data) - AMDGPU.synchronize() - return arr - end -end -function Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, x::ROCArray) - if AMDGPU.device(x) == to_device(to_proc) - return x - end - with_context(to_proc) do - _x = similar(x) - copyto!(_x, x) - AMDGPU.synchronize() - return _x - end -end - -# Out-of-place DtoH -function Dagger.move(from_proc::ROCArrayDeviceProc, to_proc::CPUProc, x) - with_context(from_proc) do - AMDGPU.synchronize() - _x = adapt(Array, x) - AMDGPU.synchronize() - return _x - end -end -function Dagger.move(from_proc::ROCArrayDeviceProc, to_proc::CPUProc, x::Chunk) - from_w = Dagger.root_worker_id(from_proc) - to_w = Dagger.root_worker_id(to_proc) - @assert myid() == to_w - remotecall_fetch(from_w, x) do x - arr = unwrap(x) - return Dagger.move(from_proc, to_proc, arr) - end -end -function Dagger.move(from_proc::ROCArrayDeviceProc, to_proc::CPUProc, x::ROCArray{T,N}) where {T,N} - with_context(AMDGPU.device(x).device_id) do - AMDGPU.synchronize() - _x = Array{T,N}(undef, size(x)) - copyto!(_x, x) - AMDGPU.synchronize() - return _x - end -end - -# Out-of-place DtoD -function Dagger.move(from_proc::ROCArrayDeviceProc, to_proc::ROCArrayDeviceProc, x::Dagger.Chunk{T}) where T<:ROCArray - if from_proc == to_proc - # Same process and GPU, no change - arr = unwrap(x) - with_context(AMDGPU.synchronize, from_proc) - return arr - elseif Dagger.root_worker_id(from_proc) == Dagger.root_worker_id(to_proc) - # Same process but different GPUs, use DtoD copy - from_arr = unwrap(x) - dev = AMDGPU.device(from_arr) - with_context(AMDGPU.synchronize, dev.device_id) - return with_context(to_proc) do - to_arr = similar(from_arr) - copyto!(to_arr, from_arr) - AMDGPU.synchronize() - to_arr - end - else - # Different node, use DtoH, serialization, HtoD - return ROCArray(remotecall_fetch(from_proc.owner, x) do x - Array(unwrap(x)) - end) - end -end - -# Adapt generic functions -Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, x::Function) = x -Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, x::Chunk{T}) where {T<:Function} = - Dagger.move(from_proc, to_proc, fetch(x)) - -# Adapt BLAS/LAPACK functions -import LinearAlgebra: BLAS, LAPACK -for lib in [BLAS, LAPACK] - for name in names(lib; all=true) - name == nameof(lib) && continue - startswith(string(name), '#') && continue - endswith(string(name), '!') || continue - - for roclib in [rocBLAS, rocSOLVER] - if name in names(roclib; all=true) - fn = getproperty(lib, name) - rocfn = getproperty(roclib, name) - @eval Dagger.move(from_proc::CPUProc, to_proc::ROCArrayDeviceProc, ::$(typeof(fn))) = $rocfn - end - end - end -end - -# Task execution -function Dagger.execute!(proc::ROCArrayDeviceProc, f, args...; kwargs...) - @nospecialize f args kwargs - tls = Dagger.get_tls() - task = Threads.@spawn begin - Dagger.set_tls!(tls) - with_context!(proc) - result = Base.@invokelatest f(args...; kwargs...) - # N.B. Synchronization must be done when accessing result or args - return result - end - - try - fetch(task) - catch err - stk = current_exceptions(task) - err, frames = stk[1] - rethrow(CapturedException(err, frames)) - end -end - -DaggerGPU.processor(::Val{:ROC}) = ROCArrayDeviceProc -DaggerGPU.cancompute(::Val{:ROC}) = AMDGPU.functional() -DaggerGPU.kernel_backend(proc::ROCArrayDeviceProc) = ROCBackend() -DaggerGPU.with_device(f, proc::ROCArrayDeviceProc) = - AMDGPU.device!(f, AMDGPU.devices()[proc.device_id]) - -Dagger.to_scope(::Val{:rocm_gpu}, sc::NamedTuple) = - Dagger.to_scope(Val{:rocm_gpus}(), merge(sc, (;rocm_gpus=[sc.rocm_gpu]))) -function Dagger.to_scope(::Val{:rocm_gpus}, sc::NamedTuple) - if haskey(sc, :worker) - workers = Int[sc.worker] - elseif haskey(sc, :workers) && sc.workers != Colon() - workers = sc.workers - else - workers = map(gproc->gproc.pid, Dagger.procs(Dagger.Sch.eager_context())) - end - scopes = Dagger.ExactScope[] - dev_ids = sc.rocm_gpus - for worker in workers - procs = Dagger.get_processors(Dagger.OSProc(worker)) - for proc in procs - proc isa ROCArrayDeviceProc || continue - if dev_ids == Colon() || proc.device_id in dev_ids - scope = Dagger.ExactScope(proc) - push!(scopes, scope) - end - end - end - return Dagger.UnionScope(scopes) -end -Dagger.scope_key_precedence(::Val{:rocm_gpu}) = 2 -Dagger.scope_key_precedence(::Val{:rocm_gpus}) = 1 - -const DEVICES = Dict{Int, HIPDevice}() -const CONTEXTS = Dict{Int, HIPContext}() -const STREAMS = Dict{Int, HIPStream}() - -function __init__() - if AMDGPU.functional() - for device_id in 1:length(AMDGPU.devices()) - dev = AMDGPU.devices()[device_id] - @debug "Registering ROCm GPU processor with Dagger: $dev" - Dagger.add_processor_callback!("rocarray_device_$device_id") do - proc = ROCArrayDeviceProc(myid(), device_id) - DEVICES[dev.device_id] = dev - ctx = HIPContext(dev) - CONTEXTS[dev.device_id] = ctx - context!(ctx) do - STREAMS[dev.device_id] = HIPStream() - end - return proc - end - end - end -end - -end # module ROCExt diff --git a/src/DaggerGPU.jl b/src/DaggerGPU.jl index 1d8cbce..999523a 100644 --- a/src/DaggerGPU.jl +++ b/src/DaggerGPU.jl @@ -1,64 +1,9 @@ module DaggerGPU -using Dagger, MemPool -using Distributed -using KernelAbstractions, Adapt +import Dagger: Kernel, gpu_processor, gpu_can_compute, with_device, move_optimized, gpu_kernel_backend -import Dagger: Chunk -import LinearAlgebra - -const CPUProc = Union{OSProc, Dagger.ThreadProc} - -struct Kernel{F} end -Kernel(f) = Kernel{f}() - -function (::Kernel{F})(args...; ndrange) where F - @nospecialize args - dev = kernel_backend() - kern = F(dev) - kern(args...; ndrange) - KernelAbstractions.synchronize(dev) -end - -macro gpuproc(PROC, T) - PROC = esc(PROC) - T = esc(T) - quote - # Assume that we can run anything - Dagger.iscompatible_func(proc::$PROC, opts, f) = true - Dagger.iscompatible_arg(proc::$PROC, opts, x) = true - - # CPUs shouldn't process our array type - Dagger.iscompatible_arg(proc::Dagger.ThreadProc, opts, x::$T) = false - end -end - -processor(kind::Symbol) = processor(Val(kind)) -processor(::Val) = Dagger.ThreadProc -cancompute(kind::Symbol) = cancompute(Val(kind)) -cancompute(::Val) = false -function with_device end - -move_optimized(from_proc::Dagger.Processor, - to_proc::Dagger.Processor, - x) = nothing - -kernel_backend() = kernel_backend(Dagger.Sch.thunk_processor()) -kernel_backend(::Dagger.ThreadProc) = CPU() - -using Requires -@static if !isdefined(Base, :get_extension) -function __init__() - @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin - include(joinpath(@__DIR__, "..", "ext", "CUDAExt.jl")) - end - @require AMDGPU="21141c5a-9bdb-4563-92ae-f87d6854732e" begin - include(joinpath(@__DIR__, "..", "ext", "ROCExt.jl")) - end - @require Metal="dde4c033-4e86-420c-a63e-0dd931031962" begin - include(joinpath(@__DIR__, "..", "ext", "MetalExt.jl")) - end -end -end +@deprecate processor gpu_processor +@deprecate cancompute gpu_can_compute +@deprecate kernel_backend gpu_kernel_backend end diff --git a/test/runtests.jl b/test/runtests.jl index 89982c4..14f598a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -105,12 +105,14 @@ end if !DaggerGPU.cancompute(:CUDA) @warn "No CUDA devices available, skipping tests" else + #= cuproc = if isdefined(Base, :get_extension) Base.get_extension(DaggerGPU, :CUDAExt).CuArrayDeviceProc else CuArrayDeviceProc end @test DaggerGPU.processor(:CUDA) === cuproc + =# ndevices = length(collect(CUDA.devices())) gpu_configs = Any[1] if ndevices > 1 @@ -222,12 +224,14 @@ end if !DaggerGPU.cancompute(:ROC) @warn "No ROCm devices available, skipping tests" else + #= rocproc = if isdefined(Base, :get_extension) Base.get_extension(DaggerGPU, :ROCExt).ROCArrayDeviceProc else ROCArrayDeviceProc end @test DaggerGPU.processor(:ROC) === rocproc + =# ndevices = length(AMDGPU.devices()) gpu_configs = Any[1] if ndevices > 1 @@ -339,12 +343,14 @@ end if !DaggerGPU.cancompute(:oneAPI) @warn "No oneAPI devices available, skipping tests" else + #= oneproc = if isdefined(Base, :get_extension) Base.get_extension(DaggerGPU, :IntelExt).oneArrayDeviceProc else oneArrayDeviceProc end @test DaggerGPU.processor(:oneAPI) === oneproc + =# ndevices = length(oneAPI.devices()) gpu_configs = Any[1] if ndevices > 1 @@ -460,12 +466,14 @@ end if !DaggerGPU.cancompute(:Metal) @warn "No Metal devices available, skipping tests" else + #= mtlproc = if isdefined(Base, :get_extension) Base.get_extension(DaggerGPU, :MetalExt).MtlArrayDeviceProc else MtlArrayDeviceProc end @test DaggerGPU.processor(:Metal) === mtlproc + =# b = generate_thunks() c = Dagger.with_options(;scope=Dagger.scope(metal_gpu=1)) do @test fetch(Dagger.@spawn isongpu(b))