
Commit a101a4c

make things CuArrays compatible
1 parent 6062180 commit a101a4c

17 files changed: +245 −110 lines

src/GPUArrays.jl

Lines changed: 1 addition & 0 deletions

@@ -3,6 +3,7 @@ module GPUArrays

 include("abstractarray.jl")
 include("abstract_gpu_interface.jl")
+include("ondevice.jl")
 include("base.jl")
 include("construction.jl")
 include("blas.jl")

src/abstract_gpu_interface.jl

Lines changed: 36 additions & 11 deletions

@@ -10,13 +10,7 @@ for sym in (:x, :y, :z)
     end
 end

-"""
-Creates a block local array pointer with `T` being the element type
-and `N` the length. Both T and N need to be static!
-"""
-function LocalMemory(state, T, N)
-    error("Not implemented")
-end
+

 """
 in CUDA terms `__synchronize`
@@ -73,13 +67,44 @@ end


 """
+Calls function `f` on the GPU.
 `A` must be a GPUArray and will help to dispatch to the correct GPU backend
 and supplies queues and contexts.
 Calls kernel with `kernel(state, args...)`, where state is dependent on the backend
 and can be used for e.g. getting an index into `A` with `linear_index(state)`.
-Optionally, number of blocks threads can be specified.
-Falls back to some heuristic dependant on the size of `A`
+Optionally, a launch configuration can be supplied in the following ways:
+
+1) A single integer, indicating how many work items (total number of threads) you want to launch.
+   In this case `linear_index(state)` will be a number in the range `1:configuration`.
+2) A tuple of integer tuples to define blocks and threads per block.
+
 """
-function gpu_call(kernel, A::AbstractArray, args::Tuple, blocks = length(A), threads = nothing)
-    kernel(args...)
+function gpu_call(f, A::GPUArray, args::Tuple, configuration = length(A))
+    ITuple = NTuple{N, Integer} where N
+    # a single integer is taken as the global size / total number of threads to launch
+    thread_blocks = if isa(configuration, Integer)
+        thread_blocks_heuristic(configuration)
+    elseif isa(configuration, ITuple)
+        # a single integer tuple is assumed to configure the blocks
+        configuration, ntuple(x-> x == 1 ? 256 : 1, length(configuration))
+    elseif isa(configuration, Tuple{ITuple, ITuple})
+        # a 2-tuple of integer tuples == blocks + threads per block
+        if any(x-> length(x) > 3 || length(x) < 1, configuration)
+            error("blocks & threads must be 1-3 dimensional. Found: $configuration")
+        end
+        map(x-> Int.(x), configuration) # make sure it all has the same int type
+    else
+        error("""Please launch a gpu kernel with a valid configuration.
+            Found: $configuration
+            Configuration needs to be:
+            1) A single integer, indicating how many work items (total number of threads) you want to launch.
+               In this case `linear_index(state)` will be a number in the range 1:configuration.
+            2) A tuple of integer tuples to define blocks and threads per block.
+               `linear_index` will be in between 1:prod((blocks..., threads...)).
+            """)
+    end
+    _gpu_call(f, A, args, thread_blocks)
 end
+
+
+# Internal GPU call function, that needs to be overloaded by the backends.
+_gpu_call(f, A, args, thread_blocks) = error("Not implemented")
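
For illustration, a minimal sketch of the three launch configurations the new `gpu_call` accepts, written against the pure-Julia `JLArray` backend from this package. The kernel and variable names are illustrative, and constructing a `JLArray` directly from an `Array` is an assumption:

using GPUArrays

# scale_kernel is a hypothetical kernel: y .= a .* x, one element per work item
function scale_kernel(state, y, x, a)
    i = linear_index(state)
    i > length(y) && return
    @inbounds y[i] = a * x[i]
    return
end

x = JLArray(rand(Float32, 1024))
y = similar(x)

gpu_call(scale_kernel, y, (y, x, 2f0))                  # heuristic derived from length(y)
gpu_call(scale_kernel, y, (y, x, 2f0), 1024)            # total number of work items
gpu_call(scale_kernel, y, (y, x, 2f0), ((4,), (256,)))  # explicit (blocks, threads per block)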

src/broadcast.jl

Lines changed: 10 additions & 3 deletions

@@ -1,7 +1,7 @@
 using Base.Broadcast
 import Base.Broadcast: broadcast!, _broadcast!, broadcast_t
 using Base.Broadcast: map_newindexer
-using Base: @propagate_inbounds, @pure
+using Base: @propagate_inbounds, @pure, RefValue

 @inline function const_kernel(state, A, op, len)
     idx = linear_index(state)
@@ -72,17 +72,21 @@ function broadcast_t(f::Any, ::Type{Any}, ::Any, ::Any, A::GPUArrays.GPUArray, a
     error("Return type couldn't be inferred for broadcast. Func: $f, $(typeof(A)), $args")
 end

+deref(x) = x
+deref(x::RefValue) = (x[],) # RefValue doesn't work with CUDAnative
+
 function _broadcast!(
         func, out::GPUArray,
         keeps::K, Idefaults::ID,
         A::AT, Bs::BT, ::Type{Val{N}}, unused2 # we don't need those arguments
     ) where {N, K, ID, AT, BT}
+
     shape = Cuint.(size(out))
     args = (A, Bs...)
     descriptor_tuple = ntuple(length(args)) do i
         BroadcastDescriptor(args[i], keeps[i], Idefaults[i])
     end
-    gpu_call(broadcast_kernel!, out, (func, out, shape, Cuint(length(out)), descriptor_tuple, A, Bs...))
+    gpu_call(broadcast_kernel!, out, (func, out, shape, Cuint(length(out)), descriptor_tuple, A, deref.(Bs)...))
     out
 end

@@ -95,7 +99,7 @@ function Base.foreach(func, over::GPUArray, Bs...)
     descriptor_tuple = ntuple(length(args)) do i
         BroadcastDescriptor(args[i], keeps[i], Idefaults[i])
     end
-    gpu_call(foreach_kernel, over, (func, shape, Cuint.(length(over)), descriptor_tuple, over, Bs...))
+    gpu_call(foreach_kernel, over, (func, shape, Cuint.(length(over)), descriptor_tuple, over, deref.(Bs)...))
     return
 end

@@ -111,6 +115,9 @@ immutable BroadcastDescriptorN{Typ, N} <: BroadcastDescriptor{Typ}
     keep::NTuple{N, Cuint}
     idefault::NTuple{N, Cuint}
 end
+function BroadcastDescriptor(val::RefValue, keep, idefault)
+    BroadcastDescriptorN{Tuple, 1}((Cuint(1),), (Cuint(0),), (Cuint(1),))
+end

 function BroadcastDescriptor(val, keep, idefault)
     N = length(keep)
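
A quick illustration of the `deref` helper added above; the results follow directly from the two methods:

deref(2f0)       # -> 2.0f0, ordinary arguments pass through untouched
deref(Ref(2f0))  # -> (2.0f0,), the RefValue is unwrapped into a 1-tuple the kernel can take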

src/construction.jl

Lines changed: 4 additions & 2 deletions

@@ -1,4 +1,4 @@
-import Base: fill!, similar, eye, zeros, fill
+import Base: fill!, similar, eye, zeros, ones, fill


 function fill(X::Type{<: GPUArray}, val, dims::Integer...)
@@ -16,10 +16,12 @@ function fill!{T, N}(A::GPUArray{T, N}, val)
 end

 zeros(T::Type{<: GPUArray}, dims::NTuple{N, Integer}) where N = fill(T, zero(eltype(T)), dims)
+ones(T::Type{<: GPUArray}, dims::NTuple{N, Integer}) where N = fill(T, one(eltype(T)), dims)

 function eyekernel(state, res::AbstractArray{T}, stride) where T
     i = linear_index(state)
-    ilin = (stride * (i - 1)) + i
+    i > stride && return
+    ilin = (stride * (i - Cuint(1))) + i
     @inbounds res[ilin] = one(T)
     return
 end
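
The index arithmetic in `eyekernel` can be sanity-checked on the CPU: in a column-major matrix with `stride` rows, the i-th diagonal entry sits at linear index `stride * (i - 1) + i`, and the new bounds check stops threads past the diagonal. A plain-Julia sketch, no GPU involved:

stride = 3
[stride * (i - 1) + i for i in 1:stride]  # -> [1, 5, 9], the diagonal of a 3x3 matrix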

src/heuristics.jl

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 function thread_blocks_heuristic(len::Integer)
+    # TODO better threads default
     threads = min(len, 256)
     blocks = ceil(Int, len / threads)
-    blocks = blocks * threads
     (blocks,), (threads,)
 end
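
With the `blocks = blocks * threads` line removed, the heuristic now returns the block count itself rather than a padded global size. Values below follow directly from the definition above:

thread_blocks_heuristic(100)   # -> ((1,), (100,)): one block of 100 threads
thread_blocks_heuristic(1000)  # -> ((4,), (256,)): 4 blocks of 256 threads cover 1000 items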

src/jlbackend.jl

Lines changed: 17 additions & 16 deletions

@@ -65,13 +65,13 @@ mutable struct JLState{N}
     blockidx::NTuple{N, Int}
     threadidx::NTuple{N, Int}
     localmem_counter::Int
-    localmems::Vector{Vector{Vector}}
+    localmems::Vector{Vector{Array}}
 end

 function JLState(threads::NTuple{N}, blockdim::NTuple{N}) where N
     idx = ntuple(i-> 1, Val{N})
     blockcount = prod(blockdim)
-    lmems = [Vector{Vector}(0) for i in 1:blockcount]
+    lmems = [Vector{Array}(0) for i in 1:blockcount]
     JLState{N}(threads, blockdim, idx, idx, 0, lmems)
 end

@@ -86,37 +86,38 @@ function JLState(state::JLState{N}, threadidx::NTuple{N}) where N
     )
 end

-function LocalMemory(state::JLState, T, N)
+function LocalMemory(state::JLState, ::Type{T}, ::Val{N}, ::Val{C}) where {T, N, C}
     state.localmem_counter += 1
     lmems = state.localmems[blockidx_x(state)]
     # first invocation in block
     if length(lmems) < state.localmem_counter
-        lmem = zeros(T, N)
+        lmem = zeros(T, N...)
         push!(lmems, lmem)
         return lmem
     else
        return lmems[state.localmem_counter]
     end
 end

-function gpu_call(f, A::JLArray, args::Tuple, blocks = nothing, threads = C_NULL)
-    if blocks == nothing
-        blocks, threads = thread_blocks_heuristic(length(A))
-    elseif isa(blocks, Integer)
-        blocks = (blocks,)
-    end
-    if threads == C_NULL
-        threads = map(x-> 1, blocks)
-    end
+function (::Type{AbstractDeviceArray})(ptr::Array, shape::NTuple{N, Integer}) where N
+    reshape(ptr, Int.(shape))
+end
+function (::Type{AbstractDeviceArray})(ptr::Array, shape::Vararg{Integer, N}) where N
+    reshape(ptr, Int.(shape))
+end
+
+
+function _gpu_call(f, A::JLArray, args::Tuple, blocks_threads::Tuple{T, T}) where T <: NTuple{N, Integer} where N
+    blocks, threads = blocks_threads
     idx = ntuple(i-> 1, length(blocks))
-    blockdim = ceil.(Int, blocks ./ threads)
+    blockdim = blocks
     state = JLState(threads, blockdim)
     device_args = to_device.(state, args)
-    tasks = Vector{Task}(prod(threads))
+    tasks = Array{Task}(threads...)
     for blockidx in CartesianRange(blockdim)
         state.blockidx = blockidx.I
         block_args = to_blocks.(state, device_args)
-        for threadidx in CartesianRange(threads)
+        for threadidx in CartesianRange(Int.(threads))
            thread_state = JLState(state, threadidx.I)
            tasks[threadidx] = @async f(thread_state, block_args...)
         end
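
A small sketch of how a `(blocks, threads)` configuration plays out on this CPU backend: one task per thread is spawned for every block, and `linear_index(state)` is expected to cover `1:prod((blocks..., threads...))` as documented for `gpu_call`. The final contents rely on that contract and are an assumption:

a = JLArray(zeros(Int, 8))
gpu_call(a, (a,), ((2,), (4,))) do state, a
    # 2 blocks x 4 threads, each work item touches one element
    i = linear_index(state)
    i > length(a) && return
    @inbounds a[i] += 1
    return
end
Array(a)  # expected: all ones, every element visited exactly once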

src/linalg.jl

Lines changed: 63 additions & 25 deletions

@@ -1,49 +1,87 @@
-function transpose_kernel!(
-        state, At, A::AbstractArray{T}, width, height, ::Val{BLOCK}, ::Val{LMem}
-    ) where {BLOCK, LMem, T}
+import Base: transpose!, permutedims!
+# function transpose_kernel!(
+#         state, At, A::AbstractArray{T}, width, height, ::Val{BLOCK}, ::Val{LMem}
+#     ) where {BLOCK, LMem, T}
+#
+#     ui1 = UInt32(1)
+#     bidx_x = blockidx_x(state) - ui1
+#     bidx_y = blockidx_y(state) - ui1
+#     tidx_x = threadidx_x(state) - ui1
+#     tidx_y = threadidx_y(state) - ui1
+#
+#     A_local = @LocalMemory(state, T, LMem)
+#
+#     base_idx_a = bidx_x * BLOCK + bidx_y * (BLOCK * width)
+#     base_idx_a_t = bidx_y * BLOCK + bidx_x * (BLOCK * height)
+#
+#     glob_idx_a = base_idx_a + tidx_x + width * tidx_y
+#     glob_idx_a_t = base_idx_a_t + tidx_x + height * tidx_y
+#     glob_idx_a >= length(A) && return
+#     A_local[tidx_y * BLOCK + tidx_x + ui1] = A[glob_idx_a + ui1]
+#     synchronize_threads(state)
+#     At[glob_idx_a_t + ui1] = A_local[tidx_x * BLOCK + tidx_y + ui1]
+#     return
+# end

-    ui1 = UInt32(1)
+function transpose_blocks!(
+        state, odata::AbstractArray{T}, idata, ::Val{SHMEM}, ::Val{TDIM}, ::Val{BLOCK_ROWS}, ::Val{NROW}
+    ) where {T, SHMEM, TDIM, BLOCK_ROWS, NROW}
+
+    ui1 = Cuint(1)
+    tile = @LocalMemory(state, T, SHMEM)
     bidx_x = blockidx_x(state) - ui1
     bidx_y = blockidx_y(state) - ui1
     tidx_x = threadidx_x(state) - ui1
     tidx_y = threadidx_y(state) - ui1

-    A_local = LocalMemory(state, T, LMem)
+    x = bidx_x * TDIM + tidx_x + ui1
+    y = bidx_y * TDIM + tidx_y + ui1
+    dims = size(idata)

-    base_idx_a = bidx_x * BLOCK + bidx_y * (BLOCK * width)
-    base_idx_a_t = bidx_y * BLOCK + bidx_x * (BLOCK * height)
+    (x <= dims[2] && (y + (BLOCK_ROWS * Cuint(3))) <= dims[1]) || return

-    glob_idx_a = base_idx_a + tidx_x + width * tidx_y
-    glob_idx_a_t = base_idx_a_t + tidx_x + height * tidx_y
+    for j = Cuint(0):Cuint(3)
+        j0 = j * BLOCK_ROWS
+        tile[tidx_x + ui1, tidx_y + j0 + ui1] = idata[y + j0, x]
+    end

-    A_local[tidx_y * BLOCK + tidx_x + ui1] = A[glob_idx_a + ui1]
     synchronize_threads(state)
-    At[glob_idx_a_t + ui1] = A_local[tidx_x * BLOCK + tidx_y + ui1]
+    for j = Cuint(0):Cuint(3)
+        j0 = j * BLOCK_ROWS
+        odata[x, y + j0] = tile[tidx_x + ui1, tidx_y + j0 + ui1]
+    end
     return
 end

-function max_block_size(dev, h::Int, w::Int)
-    dim1, dim2 = GPUArrays.blocks(dev)[1:2]
-    wgsize = GPUArrays.threads(dev)
-    wglimit = floor(Int, sqrt(wgsize))
-    return gcd(dim1, dim2, h, w, wglimit)
-end
+function transpose!{T}(At::GPUArray{T, 2}, A::GPUArray{T, 2})
+    if size(A, 1) == size(A, 2) && all(x-> x % 32 == 0, size(A))
+        outsize = UInt32.(size(At))
+        TDIM = Cuint(32); BLOCK_ROWS = Cuint(8)
+        nrows = TDIM ÷ BLOCK_ROWS
+        shmemdim = (TDIM, (TDIM + Cuint(1)))
+        static_params = map(x-> Val{x}(), (shmemdim, TDIM, BLOCK_ROWS, nrows))
+        args = (At, A, static_params...)

-function Base.transpose!{T}(At::GPUArray{T, 2}, A::GPUArray{T, 2})
-    dev = GPUArrays.device(A)
-    block_size = max_block_size(dev, size(A)...)
-    outsize = UInt32.(size(At))
-    lmem = block_size * (block_size + 1)
-    args = (At, A, outsize..., Val{block_size}(), Val{lmem}())
-    gpu_call(transpose_kernel!, At, args, (block_size, block_size))
+        griddim = ceil.(Int, size(A) ./ (TDIM, TDIM))
+        blockdim = (TDIM, BLOCK_ROWS)
+        # optimized version for 32x & square dimensions
+        gpu_call(transpose_blocks!, At, args, (griddim, blockdim))
+    else
+        # simple fallback
+        gpu_call(At, (At, A)) do state, At, A
+            idx = @cartesianidx A state
+            @inbounds At[idx[2], idx[1]] = A[idx[1], idx[2]]
+            return
+        end
+    end
     At
 end

 function genperm(I::NTuple{N}, perm::NTuple{N}) where N
     ntuple(d-> I[perm[d]], Val{N})
 end

-function Base.permutedims!(dest::GPUArray, src::GPUArray, perm)
+function permutedims!(dest::GPUArray, src::GPUArray, perm)
     perm = Cuint.((perm...,))
     gpu_call(dest, (dest, src, perm)) do state, dest, src, perm
         I = @cartesianidx dest state
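
Usage is unchanged from the caller's side; a hedged sketch of which path the new `transpose!` takes, with illustrative array sizes and the `JLArray` backend assumed:

A  = JLArray(rand(Float32, 64, 64))
At = similar(A)
transpose!(At, A)   # square and 32-divisible: tiled transpose_blocks! kernel with local memory

B  = JLArray(rand(Float32, 33, 17))
Bt = similar(B, (17, 33))
transpose!(Bt, B)   # any other shape: simple fallback kernel using @cartesianidx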

src/mapreduce.jl

Lines changed: 6 additions & 4 deletions

@@ -1,10 +1,12 @@
+import Base: any, count, countnz
+
 #############################
 # reduce
 # functions in base implemented with a direct loop need to be overloaded to use mapreduce
 any(pred, A::GPUArray) = Bool(mapreduce(pred, |, Cint(0), A))
 count(pred, A::GPUArray) = Int(mapreduce(pred, +, Cuint(0), A))
-Base.countnz(A::GPUArray) = Int(mapreduce(x-> x != 0, +, Cuint(0), A))
-Base.countnz(A::GPUArray, dim) = Int(mapreducedim(x-> x != 0, +, Cuint(0), A, dim))
+countnz(A::GPUArray) = Int(mapreduce(x-> x != 0, +, Cuint(0), A))
+countnz(A::GPUArray, dim) = Int(mapreducedim(x-> x != 0, +, Cuint(0), A, dim))


 # hack to get around of fetching the first element of the GPUArray
@@ -79,7 +81,7 @@ for i = 0:10
     # http://developer.amd.com/resources/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
     function reduce_kernel(state, f, op, v0::T, A, ::Val{LMEM}, result, $(args...)) where {T, LMEM}
         ui0 = Cuint(0); ui1 = Cuint(1); ui2 = Cuint(2)
-        tmp_local = LocalMemory(state, T, LMEM)
+        tmp_local = @LocalMemory(state, T, LMEM)
         global_index = linear_index(state)
         acc = v0
         # # Loop sequentially over chunks of input vector
@@ -128,6 +130,6 @@ function acc_mapreduce{T, OT, N}(
     out = similar(A, OT, (blocksize,))
     fill!(out, v0)
     args = (f, op, v0, A, Val{threads}(), out, rest...)
-    gpu_call(reduce_kernel, A, args, (blocksize * threads,), (threads,))
+    gpu_call(reduce_kernel, A, args, ((blocksize,), (threads,)))
     reduce(op, Array(out))
 end
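
A brief usage sketch of the reductions touched above, assuming the `JLArray` backend; the wrappers dispatch to `mapreduce`, which launches `reduce_kernel` with the new `((blocksize,), (threads,))` configuration:

a = JLArray(Float32[0, 1, 2, 0, 3])
countnz(a)           # -> 3
count(x-> x > 1, a)  # -> 2
any(x-> x > 2, a)    # -> true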
