Commit 85fa7d3

Merge pull request #10 from JuliaGPU/jps/ka-support

Add KernelAbstractions support

2 parents d8a65d9 + fcb618c

File tree: 6 files changed, +67 −49 lines

Project.toml

Lines changed: 3 additions & 2 deletions

@@ -1,17 +1,18 @@
 name = "DaggerGPU"
 uuid = "68e73e28-2238-4d5a-bf97-e5d4aa3c4be2"
 authors = ["Julian P Samaroo <[email protected]>"]
-version = "0.1.1"
+version = "0.1.2"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 
 [compat]
 Adapt = "1, 2"
-Dagger = "0.10"
+Dagger = "0.10.0"
 Requires = "1"
 julia = "1"

README.md

Lines changed: 2 additions & 2 deletions

@@ -2,9 +2,9 @@
 
 **GPU integrations for Dagger.jl**
 
-DaggerGPU.jl makes use of the `Dagger.Processor` infrastructure to dispatch Dagger kernels to NVIDIA and AMD GPUs, via CUDA.jl and AMDGPU.jl respectively. Usage is simple: `add` or `dev` DaggerGPU.jl and CUDA.jl/AMDGPU.jl appropriately, load it with `using DaggerGPU`, and add `DaggerGPU.CuArrayProc`/`DaggerGPU.ROCArrayProc` to your scheduler or thunk options (see Dagger.jl documentation for details on how to do this).
+DaggerGPU.jl makes use of the `Dagger.Processor` infrastructure to dispatch Dagger kernels to NVIDIA and AMD GPUs, via CUDA.jl and AMDGPU.jl respectively. Usage is simple: `add` or `dev` DaggerGPU.jl and CUDA.jl/AMDGPU.jl appropriately, load it with `using DaggerGPU`, and add `DaggerGPU.CuArrayDeviceProc`/`DaggerGPU.ROCArrayProc` to your scheduler or thunk options (see Dagger.jl documentation for details on how to do this).
 
-DaggerGPU.jl is still experimental, but we welcome GPU-owning users to try it out and report back on any issues or sharp edges that they encounter. When filing an issue about DaggerGPU.jl, please provide your:
+DaggerGPU.jl is still experimental, but we welcome GPU-owning users to try it out and report back on any issues or sharp edges that they encounter. When filing an issue about DaggerGPU.jl, please provide:
 - The complete error message and backtrace
 - Julia version
 - GPU vendor and model
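
As a quick illustration of the thunk-options flow the README describes, here is a minimal sketch based on the patterns exercised in test/runtests.jl; it assumes a CUDA-capable machine and uses `sum` only as a placeholder workload:

using Distributed, Dagger, DaggerGPU, CUDA

cuproc = DaggerGPU.processor(:CUDA)                    # CuArrayDeviceProc when CUDA is usable
opts   = Dagger.Sch.ThunkOptions(;proctypes=[cuproc])  # pin this thunk to the GPU processor
thunk  = delayed(sum; options=opts)(rand(Float32, 8))
collect(thunk)                                         # the scheduler runs the thunk on the CUDA processor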

src/DaggerGPU.jl

Lines changed: 4 additions & 0 deletions

@@ -2,6 +2,7 @@ module DaggerGPU
 
 using Dagger, Requires, Adapt
 using Distributed
+using KernelAbstractions
 
 macro gpuproc(PROC, T)
     quote
@@ -23,6 +24,9 @@ processor(::Val) = Dagger.ThreadProc
 cancompute(kind::Symbol) = cancompute(Val(kind))
 cancompute(::Val) = false
 
+kernel_backend() = kernel_backend(Dagger.Sch.thunk_processor())
+kernel_backend(::Dagger.ThreadProc) = CPU()
+
 function __init__()
     @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
         include("cu.jl")

src/cu.jl

Lines changed: 12 additions & 35 deletions

@@ -1,17 +1,16 @@
 using .CUDA
 import .CUDA: CuDevice, CuContext, devices, attribute
 
-export CuArrayProc, CuArrayDeviceProc, CuArraySMProc
+export CuArrayDeviceProc
 
 "Represents a single CUDA GPU device."
 struct CuArrayDeviceProc <: Dagger.Processor
     owner::Int
     #ctx::CuContext
-    device::CuDevice
+    device::Int
 end
 @gpuproc(CuArrayDeviceProc, CuArray)
-const CuArrayProc = CuArrayDeviceProc
-#= FIXME: CUDA IPC
+#= FIXME: DtoD copies and CUDA IPC
 function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x)
     if from === to
         return x
@@ -21,45 +20,23 @@ function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x)
     end
 end
 =#
 function Dagger.execute!(proc::CuArrayDeviceProc, func, args...)
-    #CUDA.context!(proc.ctx)
-    CUDA.@sync func(args...)
-end
-
-"Represents a single CUDA GPU Streaming Multiprocessor."
-struct CuArraySMProc <: Dagger.Processor
-    owner::Int
-    #ctx::CuContext
-    device::CuDevice
-    sm::Int
-end
-@gpuproc(CuArraySMProc, CuArray)
-#= FIXME: CUDA IPC
-function Dagger.move(from::CuArraySMProc, to::CuArraySMProc, x)
-    if from.device === to.device
-        return x
-    else
-        error("Not implemented")
-    end
-end
-=#
-function Dagger.execute!(proc::CuArraySMProc, func, args...)
-    #CUDA.context!(proc.ctx)
-    CUDA.@sync func(args...)
+    fetch(Threads.@spawn begin
+        task_local_storage(:processor, proc)
+        CUDA.device!(proc.device)
+        CUDA.@sync func(args...)
+    end)
 end
+Base.show(io::IO, proc::CuArrayDeviceProc) =
+    print(io, "CuArrayDeviceProc on worker $(proc.owner), device $(proc.device)")
 
 processor(::Val{:CUDA}) = CuArrayDeviceProc
 cancompute(::Val{:CUDA}) = CUDA.has_cuda()
-# TODO: CuArraySMProc
+kernel_backend(::CuArrayDeviceProc) = CUDADevice()
 
 if CUDA.has_cuda()
     for dev in devices()
         Dagger.add_callback!(proc -> begin
-            return CuArrayDeviceProc(Distributed.myid(), #=CuContext(dev),=# dev)
+            return CuArrayDeviceProc(Distributed.myid(), #=CuContext(dev),=# dev.handle)
         end)
-        for i in 1:attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
-            Dagger.add_callback!(proc -> begin
-                return CuArraySMProc(Distributed.myid(), #=CuContext(dev),=# dev, i)
-            end)
-        end
     end
 end

src/roc.jl

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@ Dagger.execute!(proc::ROCArrayProc, func, args...) = func(args...)
 
 processor(::Val{:ROC}) = ROCArrayProc
 cancompute(::Val{:ROC}) = AMDGPU.configured
+# FIXME: kernel_backend(::ROCDevice) = ROCArrayProc
 
 if AMDGPU.configured
     Dagger.add_callback!(proc -> begin

test/runtests.jl

Lines changed: 45 additions & 10 deletions

@@ -1,17 +1,27 @@
 using Distributed
 using Test
-addprocs(2)
+addprocs(2, exeflags="--project")
 
 @everywhere begin
-
-using Distributed, Dagger, DaggerGPU
-using CUDA, AMDGPU
-
-function myfunc(X)
-    @assert !(X isa Array)
-    X
+    using Distributed, Dagger, DaggerGPU
+    using CUDA, AMDGPU, KernelAbstractions
 end
+@everywhere begin
+    function myfunc(X)
+        @assert !(X isa Array)
+        X
+    end
 
+    KernelAbstractions.@kernel function fill_kernel(A, x)
+        idx = @index(Global, Linear)
+        A[idx] = x
+    end
+    function fill_thunk(A, x)
+        k = fill_kernel(DaggerGPU.kernel_backend(), 8)
+        wait(k(A, x; ndrange=8))
+        @show A
+        A
+    end
 end
 
 function generate_thunks()
@@ -21,11 +31,18 @@ end
 
 @test DaggerGPU.cancompute(:CUDA) || DaggerGPU.cancompute(:ROC)
 
+@testset "CPU" begin
+    @testset "KernelAbstractions" begin
+        A = rand(Float32, 8)
+        _A = collect(delayed(fill_thunk)(A, 2.3))
+        @test all(_A .== 2.3)
+    end
+end
+
 @testset "CUDA" begin
     if !DaggerGPU.cancompute(:CUDA)
         @warn "No CUDA devices available, skipping tests"
     else
-        didtest = true
         cuproc = DaggerGPU.processor(:CUDA)
         b = generate_thunks()
         opts = Dagger.Sch.ThunkOptions(;proctypes=[cuproc])
@@ -35,14 +52,21 @@ end
         opts = Dagger.Sch.ThunkOptions(;proctypes=[Dagger.ThreadProc])
         d = delayed(identity; options=opts)(c)
         @test collect(d) == 20
+
+        @testset "KernelAbstractions" begin
+            cuproc = DaggerGPU.processor(:CUDA)
+            opts = Dagger.Sch.ThunkOptions(;proctypes=[cuproc])
+            A = rand(Float32, 8)
+            _A = collect(delayed(fill_thunk)(A, 2.3); options=opts)
+            @test all(_A .== 2.3)
+        end
     end
 end
 
 @testset "ROCm" begin
     if !DaggerGPU.cancompute(:ROC)
         @warn "No ROCm devices available, skipping tests"
     else
-        didtest = true
         rocproc = DaggerGPU.processor(:ROC)
         b = generate_thunks()
         opts = Dagger.Sch.ThunkOptions(;proctypes=[rocproc])
@@ -52,5 +76,16 @@ end
         opts = Dagger.Sch.ThunkOptions(;proctypes=[Dagger.ThreadProc])
         d = delayed(identity; options=opts)(c)
         @test collect(d) == 20
+
+        @test_skip "KernelAbstractions"
+        #= FIXME
+        @testset "KernelAbstractions" begin
+            rocproc = DaggerGPU.processor(:ROC)
+            opts = Dagger.Sch.ThunkOptions(;proctypes=[rocproc])
+            A = rand(Float32, 8)
+            _A = collect(delayed(fill_thunk)(A, 2.3); options=opts)
+            @test all(_A .== 2.3)
+        end
+        =#
     end
 end
