Commit 85fa7d3

Merge pull request #10 from JuliaGPU/jps/ka-support

Add KernelAbstractions support

2 parents d8a65d9 + fcb618c

File tree: 6 files changed, +67 −49 lines

Project.toml

Lines changed: 3 additions & 2 deletions

@@ -1,17 +1,18 @@
 name = "DaggerGPU"
 uuid = "68e73e28-2238-4d5a-bf97-e5d4aa3c4be2"
 authors = ["Julian P Samaroo <[email protected]>"]
-version = "0.1.1"
+version = "0.1.2"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 
 [compat]
 Adapt = "1, 2"
-Dagger = "0.10"
+Dagger = "0.10.0"
 Requires = "1"
 julia = "1"

README.md

Lines changed: 2 additions & 2 deletions

@@ -2,9 +2,9 @@
 
 **GPU integrations for Dagger.jl**
 
-DaggerGPU.jl makes use of the `Dagger.Processor` infrastructure to dispatch Dagger kernels to NVIDIA and AMD GPUs, via CUDA.jl and AMDGPU.jl respectively. Usage is simple: `add` or `dev` DaggerGPU.jl and CUDA.jl/AMDGPU.jl appropriately, load it with `using DaggerGPU`, and add `DaggerGPU.CuArrayProc`/`DaggerGPU.ROCArrayProc` to your scheduler or thunk options (see Dagger.jl documentation for details on how to do this).
+DaggerGPU.jl makes use of the `Dagger.Processor` infrastructure to dispatch Dagger kernels to NVIDIA and AMD GPUs, via CUDA.jl and AMDGPU.jl respectively. Usage is simple: `add` or `dev` DaggerGPU.jl and CUDA.jl/AMDGPU.jl appropriately, load it with `using DaggerGPU`, and add `DaggerGPU.CuArrayDeviceProc`/`DaggerGPU.ROCArrayProc` to your scheduler or thunk options (see Dagger.jl documentation for details on how to do this).
 
-DaggerGPU.jl is still experimental, but we welcome GPU-owning users to try it out and report back on any issues or sharp edges that they encounter. When filing an issue about DaggerGPU.jl, please provide your:
+DaggerGPU.jl is still experimental, but we welcome GPU-owning users to try it out and report back on any issues or sharp edges that they encounter. When filing an issue about DaggerGPU.jl, please provide:
 - The complete error message and backtrace
 - Julia version
 - GPU vendor and model
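
As a quick illustration of the thunk-options flow the README describes, here is a minimal sketch based on the patterns exercised in test/runtests.jl; it assumes a CUDA-capable machine and uses `sum` only as a placeholder workload:

using Distributed, Dagger, DaggerGPU, CUDA

cuproc = DaggerGPU.processor(:CUDA)                    # CuArrayDeviceProc when CUDA is usable
opts   = Dagger.Sch.ThunkOptions(;proctypes=[cuproc])  # pin this thunk to the GPU processor
thunk  = delayed(sum; options=opts)(rand(Float32, 8))
collect(thunk)                                         # the scheduler runs the thunk on the CUDA processor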

src/DaggerGPU.jl

Lines changed: 4 additions & 0 deletions

@@ -2,6 +2,7 @@ module DaggerGPU
 
 using Dagger, Requires, Adapt
 using Distributed
+using KernelAbstractions
 
 macro gpuproc(PROC, T)
     quote
@@ -23,6 +24,9 @@ processor(::Val) = Dagger.ThreadProc
 cancompute(kind::Symbol) = cancompute(Val(kind))
 cancompute(::Val) = false
 
+kernel_backend() = kernel_backend(Dagger.Sch.thunk_processor())
+kernel_backend(::Dagger.ThreadProc) = CPU()
+
 function __init__()
     @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
         include("cu.jl")

src/cu.jl

Lines changed: 12 additions & 35 deletions

@@ -1,17 +1,16 @@
 using .CUDA
 import .CUDA: CuDevice, CuContext, devices, attribute
 
-export CuArrayProc, CuArrayDeviceProc, CuArraySMProc
+export CuArrayDeviceProc
 
 "Represents a single CUDA GPU device."
 struct CuArrayDeviceProc <: Dagger.Processor
     owner::Int
     #ctx::CuContext
-    device::CuDevice
+    device::Int
 end
 @gpuproc(CuArrayDeviceProc, CuArray)
-const CuArrayProc = CuArrayDeviceProc
-#= FIXME: CUDA IPC
+#= FIXME: DtoD copies and CUDA IPC
 function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x)
     if from === to
         return x
@@ -21,45 +20,23 @@ function Dagger.move(from::CuArrayDeviceProc, to::CuArrayDeviceProc, x)
     end
 end
 =#
 function Dagger.execute!(proc::CuArrayDeviceProc, func, args...)
-    #CUDA.context!(proc.ctx)
-    CUDA.@sync func(args...)
-end
-
-"Represents a single CUDA GPU Streaming Multiprocessor."
-struct CuArraySMProc <: Dagger.Processor
-    owner::Int
-    #ctx::CuContext
-    device::CuDevice
-    sm::Int
-end
-@gpuproc(CuArraySMProc, CuArray)
-#= FIXME: CUDA IPC
-function Dagger.move(from::CuArraySMProc, to::CuArraySMProc, x)
-    if from.device === to.device
-        return x
-    else
-        error("Not implemented")
-    end
-end
-=#
-function Dagger.execute!(proc::CuArraySMProc, func, args...)
-    #CUDA.context!(proc.ctx)
-    CUDA.@sync func(args...)
+    fetch(Threads.@spawn begin
+        task_local_storage(:processor, proc)
+        CUDA.device!(proc.device)
+        CUDA.@sync func(args...)
+    end)
 end
+Base.show(io::IO, proc::CuArrayDeviceProc) =
+    print(io, "CuArrayDeviceProc on worker $(proc.owner), device $(proc.device)")
 
 processor(::Val{:CUDA}) = CuArrayDeviceProc
 cancompute(::Val{:CUDA}) = CUDA.has_cuda()
-# TODO: CuArraySMProc
+kernel_backend(::CuArrayDeviceProc) = CUDADevice()
 
 if CUDA.has_cuda()
     for dev in devices()
         Dagger.add_callback!(proc -> begin
-            return CuArrayDeviceProc(Distributed.myid(), #=CuContext(dev),=# dev)
+            return CuArrayDeviceProc(Distributed.myid(), #=CuContext(dev),=# dev.handle)
         end)
-        for i in 1:attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
-            Dagger.add_callback!(proc -> begin
-                return CuArraySMProc(Distributed.myid(), #=CuContext(dev),=# dev, i)
-            end)
-        end
     end
 end

src/roc.jl

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@ Dagger.execute!(proc::ROCArrayProc, func, args...) = func(args...)
 
 processor(::Val{:ROC}) = ROCArrayProc
 cancompute(::Val{:ROC}) = AMDGPU.configured
+# FIXME: kernel_backend(::ROCDevice) = ROCArrayProc
 
 if AMDGPU.configured
     Dagger.add_callback!(proc -> begin

test/runtests.jl

Lines changed: 45 additions & 10 deletions

@@ -1,17 +1,27 @@
 using Distributed
 using Test
-addprocs(2)
+addprocs(2, exeflags="--project")
 
 @everywhere begin
-
-using Distributed, Dagger, DaggerGPU
-using CUDA, AMDGPU
-
-function myfunc(X)
-    @assert !(X isa Array)
-    X
+    using Distributed, Dagger, DaggerGPU
+    using CUDA, AMDGPU, KernelAbstractions
 end
+@everywhere begin
+    function myfunc(X)
+        @assert !(X isa Array)
+        X
+    end
 
+    KernelAbstractions.@kernel function fill_kernel(A, x)
+        idx = @index(Global, Linear)
+        A[idx] = x
+    end
+    function fill_thunk(A, x)
+        k = fill_kernel(DaggerGPU.kernel_backend(), 8)
+        wait(k(A, x; ndrange=8))
+        @show A
+        A
+    end
 end
 
 function generate_thunks()
@@ -21,11 +31,18 @@ end
 
 @test DaggerGPU.cancompute(:CUDA) || DaggerGPU.cancompute(:ROC)
 
+@testset "CPU" begin
+    @testset "KernelAbstractions" begin
+        A = rand(Float32, 8)
+        _A = collect(delayed(fill_thunk)(A, 2.3))
+        @test all(_A .== 2.3)
+    end
+end
+
 @testset "CUDA" begin
     if !DaggerGPU.cancompute(:CUDA)
         @warn "No CUDA devices available, skipping tests"
     else
-        didtest = true
         cuproc = DaggerGPU.processor(:CUDA)
         b = generate_thunks()
         opts = Dagger.Sch.ThunkOptions(;proctypes=[cuproc])
@@ -35,14 +52,21 @@ end
         opts = Dagger.Sch.ThunkOptions(;proctypes=[Dagger.ThreadProc])
         d = delayed(identity; options=opts)(c)
         @test collect(d) == 20
+
+        @testset "KernelAbstractions" begin
+            cuproc = DaggerGPU.processor(:CUDA)
+            opts = Dagger.Sch.ThunkOptions(;proctypes=[cuproc])
+            A = rand(Float32, 8)
+            _A = collect(delayed(fill_thunk)(A, 2.3); options=opts)
+            @test all(_A .== 2.3)
+        end
     end
 end
 
 @testset "ROCm" begin
     if !DaggerGPU.cancompute(:ROC)
         @warn "No ROCm devices available, skipping tests"
     else
-        didtest = true
         rocproc = DaggerGPU.processor(:ROC)
         b = generate_thunks()
         opts = Dagger.Sch.ThunkOptions(;proctypes=[rocproc])
@@ -52,5 +76,16 @@ end
         opts = Dagger.Sch.ThunkOptions(;proctypes=[Dagger.ThreadProc])
         d = delayed(identity; options=opts)(c)
         @test collect(d) == 20
+
+        @test_skip "KernelAbstractions"
+        #= FIXME
+        @testset "KernelAbstractions" begin
+            rocproc = DaggerGPU.processor(:ROC)
+            opts = Dagger.Sch.ThunkOptions(;proctypes=[rocproc])
+            A = rand(Float32, 8)
+            _A = collect(delayed(fill_thunk)(A, 2.3); options=opts)
+            @test all(_A .== 2.3)
+        end
+        =#
     end
 end
