Skip to content

Commit 6d18ada

Browse files
committed
add proper local memory
1 parent 2ceb49f commit 6d18ada

File tree

7 files changed

+133
-62
lines changed

7 files changed

+133
-62
lines changed

src/GPUArrays.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,11 @@ include("indexing.jl")
1313
include("linalg.jl")
1414
include("mapreduce.jl")
1515
include("vectors.jl")
16+
include("convolution.jl")
1617
include("testsuite/testsuite.jl")
1718
include("jlbackend.jl")
1819

1920
export GPUArray, gpu_call, thread_blocks_heuristic, global_size
20-
export linear_index, @linearidx, @cartesianidx
21+
export linear_index, @linearidx, @cartesianidx, convolution!
2122

2223
end # module

src/abstract_gpu_interface.jl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,14 @@ for sym in (:x, :y, :z)
1010
end
1111
end
1212

13+
"""
14+
Creates a block local array pointer with `T` being the element type
15+
and `N` the length. Both T and N need to be static!
16+
"""
17+
function LocalMemory(state, T, N)
18+
error("Not implemented")
19+
end
20+
1321
"""
1422
in CUDA terms `__synchronize`
1523
"""
@@ -42,7 +50,7 @@ function device(A::GPUArray)
4250
# makes it easier to write generic code that also works for AbstractArrays
4351
end
4452

45-
#
53+
#
4654
# @inline function synchronize_threads(state)
4755
# CUDAnative.__syncthreads()
4856
# end

src/construction.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,11 @@ similar(x::X, ::Type{T}, size::Base.Dims{N}) where {X <: GPUArray, T, N} = simil
4040
function convert(AT::Type{<: GPUArray{T, N}}, A::DenseArray{T, N}) where {T, N}
4141
copy!(AT(Base.size(A)), A)
4242
end
43-
function convert(AT::Type{<: GPUArray{T1}}, A::DenseArray{T2, N}) where {T1, T2, N}
43+
function convert(AT::Type{<: GPUArray{T1}}, A::DenseArray{T2}) where {T1, T2}
44+
copy!(similar(AT, T1, size(A)), T1.(A))
45+
end
46+
using Colors
47+
function convert(AT::Type{<: GPUArray{T1}}, A::DenseArray{T2}) where {T1 <: Colorant, T2 <: Colorant}
4448
copy!(similar(AT, T1, size(A)), T1.(A))
4549
end
4650
function convert(AT::Type{<: GPUArray}, A::DenseArray{T2, N}) where {T2, N}

src/convolution.jl

Lines changed: 64 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,54 @@
1-
function convolution_kernel(
2-
imgSrc::AbstractArray{T},
3-
kernelValues,
4-
kernelSize,
5-
imgConvolved
6-
) where T
7-
8-
w = kernelSize[1]
9-
wBy2 = w >> 1; #w divided by 2
10-
#Goes up to 15x15 filters
11-
p = LocalMemory(T, BLOCK_SIZE + 14, BLOCK_SIZE + 14) #Identification of this workgroup
12-
i = get_group_id(0);
13-
j = get_group_id(1); #Identification of work-item
14-
idX = get_local_id(0);
15-
idY = get_local_id(1);
16-
17-
ii = i*BLOCK_SIZE + idX; # == get_global_id(0);
18-
jj = j*BLOCK_SIZE + idY; # == get_global_id(1);
19-
coords = (ii, jj)
20-
#Reads pixels
21-
P[idX][idY] = imgSrc[gpu_ind2sub(sizeA, (ii, jj))]
22-
#Needs to read extra elements for the filter in the borders
23-
if (idX < w)
24-
P[idX + BLOCK_SIZE][idY] = imgSrc[gpu_ind2sub(sizeA, (ii + BLOCK_SIZE, jj))]
25-
end
26-
if (idY < w)
27-
P[idX][idY + BLOCK_SIZE] = imgSrc[gpu_ind2sub(sizeA, (ii, jj + BLOCK_SIZE))]
28-
end
29-
barrier(CLK_LOCAL_MEM_FENCE)
30-
##############
31-
float4 convPix = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
32-
float4 temp;
33-
for (int ix = 0; ix < w; ix++)
34-
for (int jy = 0; jy < w; jy++)
35-
temp = (float4)((float)P[ix][jy].x,
36-
(float)P[ix][jy].y,
37-
(float)P[ix][jy].z,
38-
(float)P[ix][jy].w);
39-
convPix += temp * kernelValues[ix + w*jy];
40-
end
41-
end
42-
##############
43-
barrier(CLK_LOCAL_MEM_FENCE);
44-
imgConvolved[ii+wBy2, jj+wBy2] = P[idX+wBy2][idY+wBy2]
45-
end
1+
# function convolution_kernel(
2+
# state,
3+
# imgSrc::AbstractArray{T},
4+
# kernelValues,
5+
# kernel_width,
6+
# imgConvolved,
7+
# ::Val{BLOCK_SIZE},
8+
# ::Val{LOCAL_WIDTH}
9+
# ) where {T, BLOCK_SIZE, LOCAL_WIDTH}
10+
# ui1 = Cuint(1); ui0 = Cuint(0)
11+
# w = kernel_width
12+
# wBy2 = w >> ui1 #w divided by 2
13+
# #Goes up to 15x15 filters
14+
# ptr = LocalMemory(state, T, LOCAL_WIDTH) # local width need to be static, so calculating it from block size won't cut it
15+
# P = CLArrays.LocalArray{T, 2}(ptr, (LOCAL_WIDTH, LOCAL_WIDTH))
16+
#
17+
# i = blockidx_x(state)
18+
# j = blockidx_y(state) #Identification of work-item
19+
# idX = threadidx_x(state)
20+
# idY = threadidx_y(state)
21+
#
22+
# ii = i*BLOCK_SIZE + idX; # == get_global_id(0);
23+
# jj = j*BLOCK_SIZE + idY; # == get_global_id(1);
24+
# #Reads pixels
25+
# P[idX, idY] = imgSrc[ii, jj]
26+
# #Needs to read extra elements for the filter in the borders
27+
# if (idX < w)
28+
# P[idX + BLOCK_SIZE, idY] = imgSrc[ii + BLOCK_SIZE, jj]
29+
# end
30+
# if (idY < w)
31+
# P[idX, idY + BLOCK_SIZE] = imgSrc[ii, jj + BLOCK_SIZE]
32+
# end
33+
# synchronize_threads(state)
34+
# ##############
35+
# convPix = zero(T);
36+
# for ix = ui0:(w - ui1)
37+
# for jy = ui0:(w - ui1)
38+
# temp = P[ix, jy]
39+
# convPix += temp * kernelValues[ix + w*jy]
40+
# end
41+
# end
42+
# ##############
43+
# synchronize_threads(state)
44+
# imgConvolved[ii + wBy2, jj + wBy2] = P[idX + wBy2, idY + wBy2]
45+
# return
46+
# end
4647

4748

4849
function convolution_kernel(state, A::AbstractArray{T}, out, K, Asize, Ksize) where T
4950
ilin = linear_index(state)
50-
idx = gpu_ind2sub(Asize, ilin)
51+
idx = GPUArrays.gpu_ind2sub(Asize, ilin)
5152
if idx[1] >= Asize[1] - Ksize[1] || idx[2] >= Asize[2] - Ksize[2]
5253
return
5354
end
@@ -64,8 +65,24 @@ function convolution_kernel(state, A::AbstractArray{T}, out, K, Asize, Ksize) wh
6465
end
6566

6667

67-
function conv!(a, out, k)
68+
function convolution!(a, out, k)
6869
gpu_call(convolution_kernel, a, (a, out, k, Cuint.(size(a)), Cuint.(size(k))))
6970
GPUArrays.synchronize(out)
7071
out
7172
end
73+
74+
immutable FFTKernel{T}
75+
kernel::T
76+
irfftplan
77+
rfftplan
78+
end
79+
80+
function fftkernel(A, kernel)
81+
plan_rfft!(A)
82+
83+
end
84+
85+
function convolution_fft!(a, out, k)
86+
irfft(rfft(A).*conj(rfft(krn)), length(indices(A,1)))
87+
out
88+
end

src/jlbackend.jl

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,39 @@ mutable struct JLState{N}
6464

6565
blockidx::NTuple{N, Int}
6666
threadidx::NTuple{N, Int}
67+
localmem_counter::Int
68+
localmems::Vector{Vector{Vector}}
69+
end
70+
71+
function JLState(threads::NTuple{N}, blockdim::NTuple{N}) where N
72+
idx = ntuple(i-> 1, Val{N})
73+
blockcount = prod(blockdim)
74+
lmems = [Vector{Vector}(0) for i in 1:blockcount]
75+
JLState{N}(threads, blockdim, idx, idx, 0, lmems)
76+
end
77+
78+
function JLState(state::JLState{N}, threadidx::NTuple{N}) where N
79+
JLState{N}(
80+
state.blockdim,
81+
state.griddim,
82+
state.blockidx,
83+
threadidx,
84+
0,
85+
state.localmems
86+
)
87+
end
88+
89+
function LocalMemory(state::JLState, T, N)
90+
state.localmem_counter += 1
91+
lmems = state.localmems[blockidx_x(state)]
92+
# first invokation in block
93+
if length(lmems) < state.localmem_counter
94+
lmem = zeros(T, N)
95+
push!(lmems, lmem)
96+
return lmem
97+
else
98+
return lmems[state.localmem_counter]
99+
end
67100
end
68101

69102
function gpu_call(f, A::JLArray, args::Tuple, blocks = nothing, threads = C_NULL)
@@ -77,14 +110,14 @@ function gpu_call(f, A::JLArray, args::Tuple, blocks = nothing, threads = C_NULL
77110
end
78111
idx = ntuple(i-> 1, length(blocks))
79112
blockdim = ceil.(Int, blocks ./ threads)
80-
state = JLState(threads, blockdim, idx, idx)
113+
state = JLState(threads, blockdim)
81114
device_args = to_device.(state, args)
82115
tasks = Vector{Task}(threads...)
83116
for blockidx in CartesianRange(blockdim)
84117
state.blockidx = blockidx.I
85118
block_args = to_blocks.(state, device_args)
86119
for threadidx in CartesianRange(threads)
87-
thread_state = JLState(state.blockdim, state.griddim, state.blockidx, threadidx.I)
120+
thread_state = JLState(state, threadidx.I)
88121
tasks[threadidx] = @async f(thread_state, block_args...)
89122
end
90123
for t in tasks

src/mapreduce.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,9 @@ for i = 0:10
7676
fargs = ntuple(x-> :(broadcast_index($(args[x]), length, global_index)), i)
7777
@eval begin
7878
# http://developer.amd.com/resources/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
79-
function reduce_kernel(state, f, op, v0, A, tmp_local, result, $(args...))
79+
function reduce_kernel(state, f, op, v0::T, A, ::Val{LMEM}, result, $(args...)) where {T, LMEM}
8080
ui0 = Cuint(0); ui1 = Cuint(1); ui2 = Cuint(2)
81+
tmp_local = LocalMemory(state, T, LMEM)
8182
global_index = linear_index(state)
8283
acc = v0
8384
# # Loop sequentially over chunks of input vector
@@ -125,8 +126,7 @@ function acc_mapreduce{T, OT, N}(
125126
end
126127
out = similar(A, OT, (blocksize,))
127128
fill!(out, v0)
128-
lmem = LocalMemory{OT}(threads)
129-
args = (f, op, v0, A, lmem, out, rest...)
129+
args = (f, op, v0, A, Val{threads}(), out, rest...)
130130
gpu_call(reduce_kernel, A, args, (blocksize * threads,), (threads,))
131131
reduce(op, Array(out))
132132
end

test/convolution.jl

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
1-
img = RGB{Float32}.(load(homedir()*"/test.jpg"));
1+
using GPUArrays, Colors, FileIO, ImageFiltering
2+
using CLArrays
3+
using GPUArrays: synchronize_threads
4+
import GPUArrays: LocalMemory
5+
using CLArrays
26

3-
a = GPUArray(img);
7+
8+
img = RGB{Float32}.(load(homedir()*"/Desktop/backround.jpg"));
9+
10+
a = CLArray(img);
411
out = similar(a);
5-
k = GPUArray(Float32.(collect(Kernel.gaussian(3))));
12+
k = CLArray(Float32.(collect(Kernel.gaussian(3))));
613
imgc = similar(img)
7-
@btime conv!($a, $out, $k);
8-
@btime
9-
@which imfilter!(imgc, img, (Kernel.gaussian(3)))
10-
Array(out)
14+
15+
# convolution!(a, out, k);
16+
# Array(out)
17+
# outc = similar(img)
18+
# copy!(outc, out)

0 commit comments

Comments
 (0)