
Commit d6f478d

start setting up backend
1 parent e3ac56b commit d6f478d

File tree

4 files changed: +512 −2 lines


src/pocl/backend.jl

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
module POCLKernels

using ..POCL
using ..POCL: @device_override, SPIRVIntrinsics, cl
using ..POCL: device

import KernelAbstractions as KA

import StaticArrays

import Adapt


## Back-end Definition

export POCLBackend

struct POCLBackend <: KA.GPU
end

# KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple) where T = CLArray{T}(undef, dims)
# KA.zeros(::POCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.zeros(T, dims)
# KA.ones(::POCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.ones(T, dims)

# KA.get_backend(::CLArray) = POCLBackend()
# KA.synchronize(::POCLBackend) = cl.finish(cl.queue())
# KA.supports_float64(::POCLBackend) = false # XXX: this is platform/device dependent

# Adapt.adapt_storage(::POCLBackend, a::Array) = Adapt.adapt(CLArray, a)
# Adapt.adapt_storage(::POCLBackend, a::CLArray) = a
# Adapt.adapt_storage(::KA.CPU, a::CLArray) = convert(Array, a)


## Memory Operations

# function KA.copyto!(::POCLBackend, A, B)
#     copyto!(A, B)
#     # TODO: Address device to host copies in jl being synchronizing
# end


## Kernel Launch

function KA.mkcontext(kernel::KA.Kernel{POCLBackend}, _ndrange, iterspace)
    KA.CompilerMetadata{KA.ndrange(kernel), KA.DynamicCheck}(_ndrange, iterspace)
end
function KA.mkcontext(kernel::KA.Kernel{POCLBackend}, I, _ndrange, iterspace,
                      ::Dynamic) where Dynamic
    KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
end

function KA.launch_config(kernel::KA.Kernel{POCLBackend}, ndrange, workgroupsize)
    if ndrange isa Integer
        ndrange = (ndrange,)
    end
    if workgroupsize isa Integer
        workgroupsize = (workgroupsize, )
    end

    # partition checked that the ndrange's agreed
    if KA.ndrange(kernel) <: KA.StaticSize
        ndrange = nothing
    end

    iterspace, dynamic = if KA.workgroupsize(kernel) <: KA.DynamicSize &&
                            workgroupsize === nothing
        # use ndrange as preliminary workgroupsize for autotuning
        KA.partition(kernel, ndrange, ndrange)
    else
        KA.partition(kernel, ndrange, workgroupsize)
    end

    return ndrange, workgroupsize, iterspace, dynamic
end

function threads_to_workgroupsize(threads, ndrange)
    total = 1
    return map(ndrange) do n
        x = min(div(threads, total), n)
        total *= x
        return x
    end
end

function (obj::KA.Kernel{POCLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
    ndrange, workgroupsize, iterspace, dynamic =
        KA.launch_config(obj, ndrange, workgroupsize)

    # this might not be the final context, since we may tune the workgroupsize
    ctx = KA.mkcontext(obj, ndrange, iterspace)
    kernel = @opencl launch=false obj.f(ctx, args...)

    # figure out the optimal workgroupsize automatically
    if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
        wg_info = cl.work_group_info(kernel.fun, device())
        wg_size_nd = threads_to_workgroupsize(wg_info.size, ndrange)
        iterspace, dynamic = KA.partition(obj, ndrange, wg_size_nd)
        ctx = KA.mkcontext(obj, ndrange, iterspace)
    end

    groups = length(KA.blocks(iterspace))
    items = length(KA.workitems(iterspace))

    if groups == 0
        return nothing
    end

    # Launch kernel
    global_size = groups * items
    local_size = items
    kernel(ctx, args...; global_size, local_size)

    return nothing
end


## Indexing Functions

@device_override @inline function KA.__index_Local_Linear(ctx)
    return get_local_id(1)
end

@device_override @inline function KA.__index_Group_Linear(ctx)
    return get_group_id(1)
end

@device_override @inline function KA.__index_Global_Linear(ctx)
    return get_global_id(1)
end

@device_override @inline function KA.__index_Local_Cartesian(ctx)
    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
end

@device_override @inline function KA.__index_Group_Cartesian(ctx)
    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
end

@device_override @inline function KA.__index_Global_Cartesian(ctx)
    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
end

@device_override @inline function KA.__validindex(ctx)
    if KA.__dynamic_checkbounds(ctx)
        I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
        return I in KA.__ndrange(ctx)
    else
        return true
    end
end


## Shared and Scratch Memory

@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
    ptr = SPIRVIntrinsics.emit_localmemory(T, Val(prod(Dims)))
    CLDeviceArray(Dims, ptr)
end

@device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
    StaticArrays.MArray{KA.__size(Dims), T}(undef)
end


## Synchronization and Printing

@device_override @inline function KA.__synchronize()
    barrier()
end

@device_override @inline function KA.__print(args...)
    SPIRVIntrinsics._print(args...)
end


## Other

KA.argconvert(::KA.Kernel{POCLBackend}, arg) = clconvert(arg)

end
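
For context, a minimal usage sketch (not part of this commit) of how a KernelAbstractions kernel could be launched through this backend. It assumes the commented-out array interface above (KA.get_backend, KA.zeros/KA.ones, KA.synchronize for CLArray) is enabled in a follow-up; the kernel name and sizes are made up for illustration.

using KernelAbstractions
import KernelAbstractions as KA

# hypothetical SAXPY kernel; @index(Global, Linear) maps to get_global_id(1)
# through the KA.__index_Global_Linear override defined above
@kernel function saxpy!(y, a, @Const(x))
    i = @index(Global, Linear)
    @inbounds y[i] = a * x[i] + y[i]
end

backend = POCLBackend()
x = KA.ones(backend, Float32, 1024)    # relies on the commented-out allocation methods
y = KA.zeros(backend, Float32, 1024)
saxpy!(backend, 256)(y, 2f0, x; ndrange = length(y))  # workgroup size 256, 1024 work-items
KA.synchronize(backend)                # relies on the commented-out synchronize method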

src/pocl/compiler/execution.jl

Lines changed: 2 additions & 2 deletions
@@ -147,7 +147,7 @@ abstract type AbstractKernel{F,TT} end
     args = (:(kernel.f), (:( clconvert(args[$i], svm_pointers) ) for i in 1:length(args))...)

     # filter out ghost arguments that shouldn't be passed
-    predicate = dt -> isghosttype(dt) || Core.Compiler.isconstType(dt)
+    predicate = dt -> GPUCompiler.isghosttype(dt) || Core.Compiler.isconstType(dt)
     to_pass = map(!predicate, sig.parameters)
     call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]]
     call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]]
@@ -165,7 +165,7 @@ abstract type AbstractKernel{F,TT} end

     quote
         svm_pointers = Ptr{Cvoid}[]
-        clcall(kernel.fun, $call_tt, $(call_args...); svm_pointers, call_kwargs...)
+        $cl.clcall(kernel.fun, $call_tt, $(call_args...); svm_pointers, call_kwargs...)
     end
 end
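
Both hunks qualify functions with the module that provides them (GPUCompiler.isghosttype, $cl.clcall), so the generated launch code no longer depends on those names being in scope unqualified. As a rough illustration (hypothetical names, not from the commit) of what the ghost-argument predicate filters out:

# Immutable zero-size ("ghost") types carry no data and cannot be passed to
# an OpenCL kernel, so they are dropped from the call.
struct Ghost end                                   # hypothetical zero-size marker type
isghost(dt) = isbitstype(dt) && sizeof(dt) == 0    # stand-in for GPUCompiler.isghosttype
sig_parameters = (Ghost, Vector{Float32}, Int32)
to_pass = map(dt -> !isghost(dt), sig_parameters)  # (false, true, true)
# only the arguments flagged `true` end up in the clcall argument list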
