add proper local memory

SimonDanisch · SimonDanisch · commit 6d18ada1ba57 · 2017-09-19T13:32:09.000+02:00
diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
@@ -13,10 +13,11 @@ include("indexing.jl")
 include("linalg.jl")
 include("mapreduce.jl")
 include("vectors.jl")
+include("convolution.jl")
 include("testsuite/testsuite.jl")
 include("jlbackend.jl")
 
 export GPUArray, gpu_call, thread_blocks_heuristic, global_size
-export linear_index, @linearidx, @cartesianidx
+export linear_index, @linearidx, @cartesianidx, convolution!
 
 end # module
diff --git a/src/abstract_gpu_interface.jl b/src/abstract_gpu_interface.jl
@@ -10,6 +10,14 @@ for sym in (:x, :y, :z)
     end
 end
 
+"""
+Creates a block-local array pointer with `T` being the element type
+and `N` the length. Both `T` and `N` need to be static!
+"""
+function LocalMemory(state, T, N)
+    error("Not implemented")
+end
+
 """
 in CUDA terms `__synchronize`
 """
@@ -42,7 +50,7 @@ function device(A::GPUArray)
     # makes it easier to write generic code that also works for AbstractArrays
 end
 
-# 
+#
 # @inline function synchronize_threads(state)
 #     CUDAnative.__syncthreads()
 # end
diff --git a/src/construction.jl b/src/construction.jl
@@ -40,7 +40,11 @@ similar(x::X, ::Type{T}, size::Base.Dims{N}) where {X <: GPUArray, T, N} = simil
 function convert(AT::Type{<: GPUArray{T, N}}, A::DenseArray{T, N}) where {T, N}
     copy!(AT(Base.size(A)), A)
 end
-function convert(AT::Type{<: GPUArray{T1}}, A::DenseArray{T2, N}) where {T1, T2, N}
+function convert(AT::Type{<: GPUArray{T1}}, A::DenseArray{T2}) where {T1, T2}
+    copy!(similar(AT, T1, size(A)), T1.(A))
+end
+using Colors
+function convert(AT::Type{<: GPUArray{T1}}, A::DenseArray{T2}) where {T1 <: Colorant, T2 <: Colorant}
     copy!(similar(AT, T1, size(A)), T1.(A))
 end
 function convert(AT::Type{<: GPUArray}, A::DenseArray{T2, N}) where {T2, N}
diff --git a/src/convolution.jl b/src/convolution.jl
@@ -1,53 +1,54 @@
-function convolution_kernel(
-        imgSrc::AbstractArray{T},
-        kernelValues,
-        kernelSize,
-        imgConvolved
-    ) where T
-
-    w = kernelSize[1]
-    wBy2 = w >> 1; #w divided by 2
-    #Goes up to 15x15 filters
-    p = LocalMemory(T, BLOCK_SIZE + 14, BLOCK_SIZE + 14) #Identification of this workgroup
-    i = get_group_id(0);
-    j = get_group_id(1); #Identification of work-item
-    idX = get_local_id(0);
-    idY = get_local_id(1);
-
-    ii = i*BLOCK_SIZE + idX; # == get_global_id(0);
-    jj = j*BLOCK_SIZE + idY; # == get_global_id(1);
-    coords = (ii, jj)
-    #Reads pixels
-    P[idX][idY] = imgSrc[gpu_ind2sub(sizeA, (ii, jj))]
-    #Needs to read extra elements for the filter in the borders
-    if (idX < w)
-        P[idX + BLOCK_SIZE][idY] = imgSrc[gpu_ind2sub(sizeA, (ii + BLOCK_SIZE, jj))]
-    end
-    if (idY < w)
-        P[idX][idY + BLOCK_SIZE] = imgSrc[gpu_ind2sub(sizeA, (ii, jj + BLOCK_SIZE))]
-    end
-    barrier(CLK_LOCAL_MEM_FENCE)
-    ##############
-    float4 convPix = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
-    float4 temp;
-    for (int ix = 0; ix < w; ix++)
-        for (int jy = 0; jy < w; jy++)
-            temp = (float4)((float)P[ix][jy].x,
-                            (float)P[ix][jy].y,
-                            (float)P[ix][jy].z,
-                            (float)P[ix][jy].w);
-            convPix += temp * kernelValues[ix + w*jy];
-        end
-    end
-    ##############
-    barrier(CLK_LOCAL_MEM_FENCE);
-    imgConvolved[ii+wBy2, jj+wBy2] = P[idX+wBy2][idY+wBy2]
-end
+# function convolution_kernel(
+#         state,
+#         imgSrc::AbstractArray{T},
+#         kernelValues,
+#         kernel_width,
+#         imgConvolved,
+#         ::Val{BLOCK_SIZE},
+#         ::Val{LOCAL_WIDTH}
+#     ) where {T, BLOCK_SIZE, LOCAL_WIDTH}
+#     ui1 = Cuint(1); ui0 = Cuint(0)
+#     w = kernel_width
+#     wBy2 = w >> ui1 #w divided by 2
+#     #Goes up to 15x15 filters
+#     ptr = LocalMemory(state, T, LOCAL_WIDTH) # local width needs to be static, so calculating it from block size won't cut it
+#     P = CLArrays.LocalArray{T, 2}(ptr, (LOCAL_WIDTH, LOCAL_WIDTH))
+#
+#     i = blockidx_x(state)
+#     j = blockidx_y(state) #Identification of work-item
+#     idX = threadidx_x(state)
+#     idY = threadidx_y(state)
+#
+#     ii = i*BLOCK_SIZE + idX; # == get_global_id(0);
+#     jj = j*BLOCK_SIZE + idY; # == get_global_id(1);
+#     #Reads pixels
+#     P[idX, idY] = imgSrc[ii, jj]
+#     #Needs to read extra elements for the filter in the borders
+#     if (idX < w)
+#         P[idX + BLOCK_SIZE, idY] = imgSrc[ii + BLOCK_SIZE, jj]
+#     end
+#     if (idY < w)
+#         P[idX, idY + BLOCK_SIZE] = imgSrc[ii, jj + BLOCK_SIZE]
+#     end
+#     synchronize_threads(state)
+#     ##############
+#     convPix = zero(T);
+#     for ix = ui0:(w - ui1)
+#         for jy = ui0:(w - ui1)
+#             temp = P[ix, jy]
+#             convPix += temp * kernelValues[ix + w*jy]
+#         end
+#     end
+#     ##############
+#     synchronize_threads(state)
+#     imgConvolved[ii + wBy2, jj + wBy2] = P[idX + wBy2, idY + wBy2]
+#     return
+# end
 
 
 function convolution_kernel(state, A::AbstractArray{T}, out, K, Asize, Ksize) where T
     ilin = linear_index(state)
-    idx = gpu_ind2sub(Asize, ilin)
+    idx = GPUArrays.gpu_ind2sub(Asize, ilin)
     if idx[1] >= Asize[1] - Ksize[1] || idx[2] >= Asize[2] - Ksize[2]
         return
     end
@@ -64,8 +65,24 @@ function convolution_kernel(state, A::AbstractArray{T}, out, K, Asize, Ksize) wh
 end
 
 
-function conv!(a, out, k)
+function convolution!(a, out, k)
     gpu_call(convolution_kernel, a, (a, out, k, Cuint.(size(a)), Cuint.(size(k))))
     GPUArrays.synchronize(out)
     out
 end
+
+immutable FFTKernel{T}
+    kernel::T
+    irfftplan
+    rfftplan
+end
+
+function fftkernel(A, kernel)
+    plan_rfft!(A)
+
+end
+
+function convolution_fft!(a, out, k)
+    irfft(rfft(a) .* conj(rfft(k)), length(indices(a, 1))) # NOTE(review): uses the arguments `a`/`k` (was undefined `A`/`krn`); result is still discarded and `out` is returned unmodified — confirm intent
+    out
+end
diff --git a/src/jlbackend.jl b/src/jlbackend.jl
@@ -64,6 +64,39 @@ mutable struct JLState{N}
 
     blockidx::NTuple{N, Int}
     threadidx::NTuple{N, Int}
+    localmem_counter::Int
+    localmems::Vector{Vector{Vector}}
+end
+
+function JLState(threads::NTuple{N}, blockdim::NTuple{N}) where N
+    idx = ntuple(i-> 1, Val{N})
+    blockcount = prod(blockdim)
+    lmems = [Vector{Vector}(0) for i in 1:blockcount]
+    JLState{N}(threads, blockdim, idx, idx, 0, lmems)
+end
+
+function JLState(state::JLState{N}, threadidx::NTuple{N}) where N
+    JLState{N}(
+        state.blockdim,
+        state.griddim,
+        state.blockidx,
+        threadidx,
+        0,
+        state.localmems
+    )
+end
+
+function LocalMemory(state::JLState, T, N)
+    state.localmem_counter += 1
+    lmems = state.localmems[blockidx_x(state)]
+    # first invocation in block
+    if length(lmems) < state.localmem_counter
+        lmem = zeros(T, N)
+        push!(lmems, lmem)
+        return lmem
+    else
+        return lmems[state.localmem_counter]
+    end
 end
 
 function gpu_call(f, A::JLArray, args::Tuple, blocks = nothing, threads = C_NULL)
@@ -77,14 +110,14 @@ function gpu_call(f, A::JLArray, args::Tuple, blocks = nothing, threads = C_NULL
     end
     idx = ntuple(i-> 1, length(blocks))
     blockdim = ceil.(Int, blocks ./ threads)
-    state = JLState(threads, blockdim, idx, idx)
+    state = JLState(threads, blockdim)
     device_args = to_device.(state, args)
     tasks = Vector{Task}(threads...)
     for blockidx in CartesianRange(blockdim)
         state.blockidx = blockidx.I
         block_args = to_blocks.(state, device_args)
         for threadidx in CartesianRange(threads)
-            thread_state = JLState(state.blockdim, state.griddim, state.blockidx, threadidx.I)
+            thread_state = JLState(state, threadidx.I)
             tasks[threadidx] = @async f(thread_state, block_args...)
         end
         for t in tasks
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
@@ -76,8 +76,9 @@ for i = 0:10
     fargs = ntuple(x-> :(broadcast_index($(args[x]), length, global_index)), i)
     @eval begin
         # http://developer.amd.com/resources/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
-        function reduce_kernel(state, f, op, v0, A, tmp_local, result, $(args...))
+        function reduce_kernel(state, f, op, v0::T, A, ::Val{LMEM}, result, $(args...)) where {T, LMEM}
             ui0 = Cuint(0); ui1 = Cuint(1); ui2 = Cuint(2)
+            tmp_local = LocalMemory(state, T, LMEM)
             global_index = linear_index(state)
             acc = v0
             # # Loop sequentially over chunks of input vector
@@ -125,8 +126,7 @@ function acc_mapreduce{T, OT, N}(
     end
     out = similar(A, OT, (blocksize,))
     fill!(out, v0)
-    lmem = LocalMemory{OT}(threads)
-    args = (f, op, v0, A, lmem, out, rest...)
+    args = (f, op, v0, A, Val{threads}(), out, rest...)
     gpu_call(reduce_kernel, A, args, (blocksize * threads,), (threads,))
     reduce(op, Array(out))
 end
diff --git a/test/convolution.jl b/test/convolution.jl
@@ -1,10 +1,18 @@
-img = RGB{Float32}.(load(homedir()*"/test.jpg"));
+using GPUArrays, Colors, FileIO, ImageFiltering
+using CLArrays
+using GPUArrays: synchronize_threads
+import GPUArrays: LocalMemory
+using CLArrays
 
-a = GPUArray(img);
+
+img = RGB{Float32}.(load(homedir()*"/Desktop/backround.jpg"));
+
+a = CLArray(img);
 out = similar(a);
-k = GPUArray(Float32.(collect(Kernel.gaussian(3))));
+k = CLArray(Float32.(collect(Kernel.gaussian(3))));
 imgc = similar(img)
-@btime conv!($a, $out, $k);
-@btime
-@which imfilter!(imgc, img, (Kernel.gaussian(3)))
-Array(out)
+
+# convolution!(a, out, k);
+# Array(out)
+# outc = similar(img)
+# copy!(outc, out)