module POCLKernels

using ..POCL
using ..POCL: @device_override, SPIRVIntrinsics, cl
using ..POCL: device

import KernelAbstractions as KA

import StaticArrays

import Adapt


## Back-end Definition

export POCLBackend

struct POCLBackend <: KA.GPU
end

# KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple) where T = CLArray{T}(undef, dims)
# KA.zeros(::POCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.zeros(T, dims)
# KA.ones(::POCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.ones(T, dims)

# KA.get_backend(::CLArray) = POCLBackend()
# KA.synchronize(::POCLBackend) = cl.finish(cl.queue())
# KA.supports_float64(::POCLBackend) = false # XXX: this is platform/device dependent

# Adapt.adapt_storage(::POCLBackend, a::Array) = Adapt.adapt(CLArray, a)
# Adapt.adapt_storage(::POCLBackend, a::CLArray) = a
# Adapt.adapt_storage(::KA.CPU, a::CLArray) = convert(Array, a)


## Memory Operations

# function KA.copyto!(::POCLBackend, A, B)
#     copyto!(A, B)
#     # TODO: Address device to host copies in jl being synchronizing
# end


## Kernel Launch

function KA.mkcontext(kernel::KA.Kernel{POCLBackend}, _ndrange, iterspace)
    KA.CompilerMetadata{KA.ndrange(kernel), KA.DynamicCheck}(_ndrange, iterspace)
end
function KA.mkcontext(kernel::KA.Kernel{POCLBackend}, I, _ndrange, iterspace,
                      ::Dynamic) where Dynamic
    KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
end

function KA.launch_config(kernel::KA.Kernel{POCLBackend}, ndrange, workgroupsize)
    if ndrange isa Integer
        ndrange = (ndrange,)
    end
    if workgroupsize isa Integer
        workgroupsize = (workgroupsize,)
    end

    # partition already checked that the ndranges agree
    if KA.ndrange(kernel) <: KA.StaticSize
        ndrange = nothing
    end

    iterspace, dynamic = if KA.workgroupsize(kernel) <: KA.DynamicSize &&
                            workgroupsize === nothing
        # use ndrange as preliminary workgroupsize for autotuning
        KA.partition(kernel, ndrange, ndrange)
    else
        KA.partition(kernel, ndrange, workgroupsize)
    end

    return ndrange, workgroupsize, iterspace, dynamic
end

# Greedily split a flat work-item count over the dimensions of `ndrange`:
# fill the leading dimensions first, never exceeding a dimension's extent.
function threads_to_workgroupsize(threads, ndrange)
    total = 1
    return map(ndrange) do n
        x = min(div(threads, total), n)
        total *= x
        return x
    end
end
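
# For example (illustrative): with 256 available work-items and ndrange = (1000, 1000),
# threads_to_workgroupsize(256, (1000, 1000)) == (256, 1): the first dimension takes
# all 256 items, leaving a single item for the second dimension.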

function (obj::KA.Kernel{POCLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
    ndrange, workgroupsize, iterspace, dynamic =
        KA.launch_config(obj, ndrange, workgroupsize)

    # this might not be the final context, since we may tune the workgroupsize
    ctx = KA.mkcontext(obj, ndrange, iterspace)
    kernel = @opencl launch=false obj.f(ctx, args...)

    # figure out the optimal workgroupsize automatically
    if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
        wg_info = cl.work_group_info(kernel.fun, device())
        wg_size_nd = threads_to_workgroupsize(wg_info.size, ndrange)
        iterspace, dynamic = KA.partition(obj, ndrange, wg_size_nd)
        ctx = KA.mkcontext(obj, ndrange, iterspace)
    end

    groups = length(KA.blocks(iterspace))
    items = length(KA.workitems(iterspace))

    if groups == 0
        return nothing
    end

    # Launch kernel
    global_size = groups * items
    local_size = items
    kernel(ctx, args...; global_size, local_size)

    return nothing
end
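
# Illustrative mapping (assuming a dynamic 1D launch): for ndrange = (1000,) and a
# device-reported workgroup size of 256, KA.partition yields cld(1000, 256) == 4 groups
# of 256 work-items, so the kernel is enqueued with global_size = 1024 and
# local_size = 256; out-of-range work-items are masked by KA.__validindex below.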


## Indexing Functions

@device_override @inline function KA.__index_Local_Linear(ctx)
    return get_local_id(1)
end

@device_override @inline function KA.__index_Group_Linear(ctx)
    return get_group_id(1)
end

@device_override @inline function KA.__index_Global_Linear(ctx)
    return get_global_id(1)
end

@device_override @inline function KA.__index_Local_Cartesian(ctx)
    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
end

@device_override @inline function KA.__index_Group_Cartesian(ctx)
    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
end

@device_override @inline function KA.__index_Global_Cartesian(ctx)
    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
end

@device_override @inline function KA.__validindex(ctx)
    if KA.__dynamic_checkbounds(ctx)
        I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
        return I in KA.__ndrange(ctx)
    else
        return true
    end
end
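
# Note: these overrides assume the SPIRVIntrinsics work-item builtins (get_local_id,
# get_group_id, get_global_id) follow Julia's 1-based convention, so their results can
# be used directly as KernelAbstractions indices and to index into the iteration space
# without adjustment.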


## Shared and Scratch Memory

@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
    ptr = SPIRVIntrinsics.emit_localmemory(T, Val(prod(Dims)))
    CLDeviceArray(Dims, ptr)
end

@device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
    StaticArrays.MArray{KA.__size(Dims), T}(undef)
end
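
# In a KernelAbstractions kernel, `@localmem T dims` lowers to KA.SharedMemory and thus
# to a workgroup-local allocation via emit_localmemory, while `@private T dims` lowers
# to KA.Scratchpad and becomes a per-work-item StaticArrays.MArray in private memory.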


## Synchronization and Printing

@device_override @inline function KA.__synchronize()
    barrier()
end

@device_override @inline function KA.__print(args...)
    SPIRVIntrinsics._print(args...)
end


## Other

KA.argconvert(::KA.Kernel{POCLBackend}, arg) = clconvert(arg)

end