Skip to content

Commit 33cf1a9

Browse files
authored
Merge pull request #37 from JuliaGPU/jps/oneapi
Add Intel (oneAPI) support
2 parents 8d85357 + 3fb9555 commit 33cf1a9

File tree

4 files changed

+462
-0
lines changed

4 files changed

+462
-0
lines changed

.buildkite/pipeline.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,21 @@ steps:
2929
env:
3030
CI_USE_ROCM: "1"
3131

32+
- label: "[oneAPI] Julia v1.10"
33+
plugins:
34+
- JuliaCI/julia#v1:
35+
version: "1.10"
36+
- JuliaCI/julia-test#v1: ~
37+
- JuliaCI/julia-coverage#v1:
38+
codecov: false # TODO
39+
agents:
40+
queue: "juliagpu"
41+
intel: "*"
42+
if: build.message !~ /\[skip tests\]/
43+
timeout_in_minutes: 20
44+
env:
45+
CI_USE_ONEAPI: "1"
46+
3247
- label: "[Metal] Julia v1.10"
3348
plugins:
3449
- JuliaCI/julia#v1:

Project.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,19 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
1313
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1414
MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
1515
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
16+
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
1617
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
1718
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
1819

1920
[weakdeps]
2021
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
2122
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
2223
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
24+
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
2325

2426
[extensions]
2527
CUDAExt = "CUDA"
28+
IntelExt = "oneAPI"
2629
MetalExt = "Metal"
2730
ROCExt = "AMDGPU"
2831

@@ -34,5 +37,6 @@ Dagger = "0.18.12"
3437
KernelAbstractions = "0.9"
3538
MemPool = "0.3, 0.4"
3639
Metal = "1.1"
40+
oneAPI = "1"
3741
Requires = "1"
3842
julia = "1.7"

ext/IntelExt.jl

Lines changed: 312 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,312 @@
1+
module IntelExt
2+
3+
export oneArrayDeviceProc
4+
5+
import Dagger, DaggerGPU, MemPool
6+
import Dagger: CPURAMMemorySpace, Chunk, unwrap
7+
import MemPool: DRef, poolget
8+
import Distributed: myid, remotecall_fetch
9+
import LinearAlgebra
10+
using KernelAbstractions, Adapt
11+
12+
const CPUProc = Union{Dagger.OSProc,Dagger.ThreadProc}
13+
14+
if isdefined(Base, :get_extension)
15+
import oneAPI
16+
else
17+
import ..oneAPI
18+
end
19+
import oneAPI: ZeDevice, ZeDriver, ZeContext, oneArray, oneAPIBackend
20+
import oneAPI: driver, driver!, device, device!, context, context!
21+
#import oneAPI: CUBLAS, CUSOLVER
22+
23+
using UUIDs
24+
25+
"Represents a single Intel GPU device."
26+
struct oneArrayDeviceProc <: Dagger.Processor
27+
owner::Int
28+
device_id::Int
29+
end
30+
Dagger.get_parent(proc::oneArrayDeviceProc) = Dagger.OSProc(proc.owner)
31+
Dagger.root_worker_id(proc::oneArrayDeviceProc) = proc.owner
32+
Base.show(io::IO, proc::oneArrayDeviceProc) =
33+
print(io, "oneArrayDeviceProc(worker $(proc.owner), device $(proc.device_id))")
34+
Dagger.short_name(proc::oneArrayDeviceProc) = "W: $(proc.owner), oneAPI: $(proc.device)"
35+
DaggerGPU.@gpuproc(oneArrayDeviceProc, oneArray)
36+
37+
"Represents the memory space of a single Intel GPU's VRAM."
38+
struct IntelVRAMMemorySpace <: Dagger.MemorySpace
39+
owner::Int
40+
device_id::Int
41+
end
42+
Dagger.root_worker_id(space::IntelVRAMMemorySpace) = space.owner
43+
function Dagger.memory_space(x::oneArray)
44+
dev = oneAPI.device(x)
45+
device_id = _device_id(dev)
46+
return IntelVRAMMemorySpace(myid(), device_id)
47+
end
48+
_device_id(dev::ZeDevice) = findfirst(other_dev->other_dev === dev, collect(oneAPI.devices()))
49+
50+
Dagger.memory_spaces(proc::oneArrayDeviceProc) = Set([IntelVRAMMemorySpace(proc.owner, proc.device_id)])
51+
Dagger.processors(space::IntelVRAMMemorySpace) = Set([oneArrayDeviceProc(space.owner, space.device_id)])
52+
53+
function to_device(proc::oneArrayDeviceProc)
54+
@assert Dagger.root_worker_id(proc) == myid()
55+
return DEVICES[proc.device_id]
56+
end
57+
58+
function with_context!(device_id::Integer)
59+
driver!(DRIVERS[device_id])
60+
device!(DEVICES[device_id])
61+
context!(CONTEXTS[device_id])
62+
end
63+
function with_context!(proc::oneArrayDeviceProc)
64+
@assert Dagger.root_worker_id(proc) == myid()
65+
with_context!(proc.device_id)
66+
end
67+
function with_context!(space::IntelVRAMMemorySpace)
68+
@assert Dagger.root_worker_id(space) == myid()
69+
with_context!(space.device_id)
70+
end
71+
function with_context(f, x)
72+
old_drv = driver()
73+
old_dev = device()
74+
old_ctx = context()
75+
76+
with_context!(x)
77+
try
78+
f()
79+
finally
80+
driver!(old_drv)
81+
device!(old_dev)
82+
context!(old_ctx)
83+
end
84+
end
85+
86+
function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace})
87+
if Dagger.root_worker_id(x) == myid()
88+
with_context(oneAPI.synchronize, x)
89+
else
90+
# Do nothing, as we have received our value over a serialization
91+
# boundary, which should synchronize for us
92+
end
93+
end
94+
95+
# Allocations
96+
Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(rand)) = oneAPI.rand
97+
Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(randn)) = oneAPI.randn
98+
Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(ones)) = oneAPI.ones
99+
Dagger.allocate_array_func(::oneArrayDeviceProc, ::typeof(zeros)) = oneAPI.zeros
100+
struct AllocateUndef{S} end
101+
(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = oneArray{S,N}(undef, dims)
102+
Dagger.allocate_array_func(::oneArrayDeviceProc, ::Dagger.AllocateUndef{S}) where S = AllocateUndef{S}()
103+
104+
# In-place
105+
# N.B. These methods assume that later operations will implicitly or
106+
# explicitly synchronize with their associated stream
107+
function Dagger.move!(to_space::Dagger.CPURAMMemorySpace, from_space::IntelVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
108+
if Dagger.root_worker_id(from_space) == myid()
109+
sync_with_context(from_space)
110+
with_context!(from_space)
111+
end
112+
copyto!(to, from)
113+
# N.B. DtoH will synchronize
114+
return
115+
end
116+
function Dagger.move!(to_space::IntelVRAMMemorySpace, from_space::Dagger.CPURAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
117+
with_context!(to_space)
118+
copyto!(to, from)
119+
return
120+
end
121+
function Dagger.move!(to_space::IntelVRAMMemorySpace, from_space::IntelVRAMMemorySpace, to::AbstractArray{T,N}, from::AbstractArray{T,N}) where {T,N}
122+
sync_with_context(from_space)
123+
with_context!(to_space)
124+
copyto!(to, from)
125+
return
126+
end
127+
128+
# Out-of-place HtoD
129+
function Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x)
130+
with_context(to_proc) do
131+
arr = adapt(oneArray, x)
132+
oneAPI.synchronize()
133+
return arr
134+
end
135+
end
136+
function Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::Chunk)
137+
from_w = Dagger.root_worker_id(from_proc)
138+
to_w = Dagger.root_worker_id(to_proc)
139+
@assert myid() == to_w
140+
cpu_data = remotecall_fetch(unwrap, from_w, x)
141+
with_context(to_proc) do
142+
arr = adapt(oneArray, cpu_data)
143+
oneAPI.synchronize()
144+
return arr
145+
end
146+
end
147+
function Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::oneArray)
148+
if oneAPI.device(x) == to_device(to_proc)
149+
return x
150+
end
151+
with_context(to_proc) do
152+
_x = similar(x)
153+
copyto!(_x, x)
154+
oneAPI.synchronize()
155+
return _x
156+
end
157+
end
158+
159+
# Out-of-place DtoH
160+
function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::CPUProc, x)
161+
with_context(from_proc) do
162+
oneAPI.synchronize()
163+
_x = adapt(Array, x)
164+
oneAPI.synchronize()
165+
return _x
166+
end
167+
end
168+
function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::CPUProc, x::Chunk)
169+
from_w = Dagger.root_worker_id(from_proc)
170+
to_w = Dagger.root_worker_id(to_proc)
171+
@assert myid() == to_w
172+
remotecall_fetch(from_w, x) do x
173+
arr = unwrap(x)
174+
return Dagger.move(from_proc, to_proc, arr)
175+
end
176+
end
177+
function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::CPUProc, x::oneArray{T,N}) where {T,N}
178+
with_context(from_proc) do
179+
oneAPI.synchronize()
180+
_x = Array{T,N}(undef, size(x))
181+
copyto!(_x, x)
182+
oneAPI.synchronize()
183+
return _x
184+
end
185+
end
186+
187+
# Out-of-place DtoD
188+
function Dagger.move(from_proc::oneArrayDeviceProc, to_proc::oneArrayDeviceProc, x::Dagger.Chunk{T}) where T<:oneArray
189+
if from_proc == to_proc
190+
# Same process and GPU, no change
191+
arr = unwrap(x)
192+
with_context(oneAPI.synchronize, from_proc)
193+
return arr
194+
elseif Dagger.root_worker_id(from_proc) == Dagger.root_worker_id(to_proc)
195+
# Same process but different GPUs, use DtoD copy
196+
from_arr = unwrap(x)
197+
with_context(oneAPI.synchronize, from_proc)
198+
return with_context(to_proc) do
199+
to_arr = similar(from_arr)
200+
copyto!(to_arr, from_arr)
201+
oneAPI.synchronize()
202+
return to_arr
203+
end
204+
else
205+
# Different node, use DtoH, serialization, HtoD
206+
return oneArray(remotecall_fetch(from_proc.owner, x) do x
207+
Array(unwrap(x))
208+
end)
209+
end
210+
end
211+
212+
# Adapt generic functions
213+
Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::Function) = x
214+
Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, x::Chunk{T}) where {T<:Function} =
215+
Dagger.move(from_proc, to_proc, fetch(x))
216+
217+
#= FIXME: Adapt BLAS/LAPACK functions
218+
import LinearAlgebra: BLAS, LAPACK
219+
for lib in [BLAS, LAPACK]
220+
for name in names(lib; all=true)
221+
name == nameof(lib) && continue
222+
startswith(string(name), '#') && continue
223+
endswith(string(name), '!') || continue
224+
225+
for culib in [CUBLAS, CUSOLVER]
226+
if name in names(culib; all=true)
227+
fn = getproperty(lib, name)
228+
cufn = getproperty(culib, name)
229+
@eval Dagger.move(from_proc::CPUProc, to_proc::oneArrayDeviceProc, ::$(typeof(fn))) = $cufn
230+
end
231+
end
232+
end
233+
end
234+
=#
235+
236+
# Task execution
237+
function Dagger.execute!(proc::oneArrayDeviceProc, f, args...; kwargs...)
238+
@nospecialize f args kwargs
239+
tls = Dagger.get_tls()
240+
task = Threads.@spawn begin
241+
Dagger.set_tls!(tls)
242+
with_context!(proc)
243+
result = Base.@invokelatest f(args...; kwargs...)
244+
# N.B. Synchronization must be done when accessing result or args
245+
return result
246+
end
247+
248+
try
249+
fetch(task)
250+
catch err
251+
stk = current_exceptions(task)
252+
err, frames = stk[1]
253+
rethrow(CapturedException(err, frames))
254+
end
255+
end
256+
257+
DaggerGPU.processor(::Val{:oneAPI}) = oneArrayDeviceProc
258+
DaggerGPU.cancompute(::Val{:oneAPI}) = oneAPI.functional()
259+
DaggerGPU.kernel_backend(::oneArrayDeviceProc) = oneAPIBackend()
260+
DaggerGPU.with_device(f, proc::oneArrayDeviceProc) =
261+
device!(f, proc.device_id)
262+
263+
Dagger.to_scope(::Val{:intel_gpu}, sc::NamedTuple) =
264+
Dagger.to_scope(Val{:intel_gpus}(), merge(sc, (;intel_gpus=[sc.intel_gpu])))
265+
Dagger.scope_key_precedence(::Val{:intel_gpu}) = 1
266+
function Dagger.to_scope(::Val{:intel_gpus}, sc::NamedTuple)
267+
if haskey(sc, :worker)
268+
workers = Int[sc.worker]
269+
elseif haskey(sc, :workers) && sc.workers != Colon()
270+
workers = sc.workers
271+
else
272+
workers = map(gproc->gproc.pid, Dagger.procs(Dagger.Sch.eager_context()))
273+
end
274+
scopes = Dagger.ExactScope[]
275+
dev_ids = sc.intel_gpus
276+
for worker in workers
277+
procs = Dagger.get_processors(Dagger.OSProc(worker))
278+
for proc in procs
279+
proc isa oneArrayDeviceProc || continue
280+
if dev_ids == Colon() || proc.device_id in dev_ids
281+
scope = Dagger.ExactScope(proc)
282+
push!(scopes, scope)
283+
end
284+
end
285+
end
286+
return Dagger.UnionScope(scopes)
287+
end
288+
Dagger.scope_key_precedence(::Val{:intel_gpus}) = 1
289+
290+
const DEVICES = Dict{Int, ZeDevice}()
291+
const DRIVERS = Dict{Int, ZeDriver}()
292+
const CONTEXTS = Dict{Int, ZeContext}()
293+
294+
function __init__()
295+
if oneAPI.functional()
296+
for (device_id, dev) in enumerate(oneAPI.devices())
297+
@debug "Registering Intel GPU processor with Dagger: $dev"
298+
Dagger.add_processor_callback!("zearray_device_$(device_id)") do
299+
proc = oneArrayDeviceProc(myid(), device_id)
300+
DEVICES[device_id] = dev
301+
driver!(dev.driver)
302+
DRIVERS[device_id] = dev.driver
303+
device!(dev)
304+
ctx = ZeContext(dev.driver)
305+
CONTEXTS[device_id] = ctx
306+
return proc
307+
end
308+
end
309+
end
310+
end
311+
312+
end # module IntelExt

0 commit comments

Comments
 (0)