
Commit a101a4c

make things CuArrays compatible
1 parent 6062180 commit a101a4c

17 files changed: +245 −110 lines

src/GPUArrays.jl

Lines changed: 1 addition & 0 deletions

@@ -3,6 +3,7 @@ module GPUArrays

 include("abstractarray.jl")
 include("abstract_gpu_interface.jl")
+include("ondevice.jl")
 include("base.jl")
 include("construction.jl")
 include("blas.jl")

src/abstract_gpu_interface.jl

Lines changed: 36 additions & 11 deletions

@@ -10,13 +10,7 @@ for sym in (:x, :y, :z)
     end
 end

-"""
-Creates a block local array pointer with `T` being the element type
-and `N` the length. Both T and N need to be static!
-"""
-function LocalMemory(state, T, N)
-    error("Not implemented")
-end
+

 """
 in CUDA terms `__synchronize`
@@ -73,13 +67,44 @@ end


 """
+Calls function `f` on the GPU.
 `A` must be a GPUArray and will help to dispatch to the correct GPU backend
 and supplies queues and contexts.
 Calls kernel with `kernel(state, args...)`, where state is dependent on the backend
 and can be used for e.g. getting an index into `A` with `linear_index(state)`.
-Optionally, number of blocks threads can be specified.
-Falls back to some heuristic dependant on the size of `A`
+Optionally, a launch configuration can be supplied in the following ways:
+
+1) A single integer, indicating how many work items (total number of threads) you want to launch.
+   In this case `linear_index(state)` will be a number in the range `1:configuration`.
+2) A tuple of integer tuples to define blocks and threads per block.
+
 """
-function gpu_call(kernel, A::AbstractArray, args::Tuple, blocks = length(A), threads = nothing)
-    kernel(args...)
+function gpu_call(f, A::GPUArray, args::Tuple, configuration = length(A))
+    ITuple = NTuple{N, Integer} where N
+    # a single integer is taken as the global size / total number of threads to launch
+    thread_blocks = if isa(configuration, Integer)
+        thread_blocks_heuristic(configuration)
+    elseif isa(configuration, ITuple)
+        # a single integer tuple is assumed to configure the blocks
+        configuration, ntuple(x-> x == 1 ? 256 : 1, length(configuration))
+    elseif isa(configuration, Tuple{ITuple, ITuple})
+        # a 2-tuple of integer tuples == blocks + threads per block
+        if any(x-> length(x) > 3 || length(x) < 1, configuration)
+            error("blocks & threads must be 1-3 dimensional. Found: $configuration")
+        end
+        map(x-> Int.(x), configuration) # make sure it all has the same int type
+    else
+        error("""Please launch a gpu kernel with a valid configuration.
+            Found: $configuration
+            Configuration needs to be:
+            1) A single integer, indicating how many work items (total number of threads) you want to launch.
+               In this case `linear_index(state)` will be a number in the range 1:configuration.
+            2) A tuple of integer tuples to define blocks and threads per block.
+               `linear_index` will be in between 1:prod((blocks..., threads...)).
+            """)
+    end
+    _gpu_call(f, A, args, thread_blocks)
 end
+
+
+# Internal GPU call function, that needs to be overloaded by the backends.
+_gpu_call(f, A, args, thread_blocks) = error("Not implemented")
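
For illustration, a minimal sketch of the three launch configurations the new `gpu_call` accepts, written against the pure-Julia `JLArray` backend from this package. The kernel and variable names are illustrative, and constructing a `JLArray` directly from an `Array` is an assumption:

using GPUArrays

# scale_kernel is a hypothetical kernel: y .= a .* x, one element per work item
function scale_kernel(state, y, x, a)
    i = linear_index(state)
    i > length(y) && return
    @inbounds y[i] = a * x[i]
    return
end

x = JLArray(rand(Float32, 1024))
y = similar(x)

gpu_call(scale_kernel, y, (y, x, 2f0))                  # heuristic derived from length(y)
gpu_call(scale_kernel, y, (y, x, 2f0), 1024)            # total number of work items
gpu_call(scale_kernel, y, (y, x, 2f0), ((4,), (256,)))  # explicit (blocks, threads per block)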

src/broadcast.jl

Lines changed: 10 additions & 3 deletions

@@ -1,7 +1,7 @@
 using Base.Broadcast
 import Base.Broadcast: broadcast!, _broadcast!, broadcast_t
 using Base.Broadcast: map_newindexer
-using Base: @propagate_inbounds, @pure
+using Base: @propagate_inbounds, @pure, RefValue

 @inline function const_kernel(state, A, op, len)
     idx = linear_index(state)
@@ -72,17 +72,21 @@ function broadcast_t(f::Any, ::Type{Any}, ::Any, ::Any, A::GPUArrays.GPUArray, a
     error("Return type couldn't be inferred for broadcast. Func: $f, $(typeof(A)), $args")
 end

+deref(x) = x
+deref(x::RefValue) = (x[],) # RefValue doesn't work with CUDAnative
+
 function _broadcast!(
         func, out::GPUArray,
         keeps::K, Idefaults::ID,
         A::AT, Bs::BT, ::Type{Val{N}}, unused2 # we don't need those arguments
     ) where {N, K, ID, AT, BT}
+
     shape = Cuint.(size(out))
     args = (A, Bs...)
     descriptor_tuple = ntuple(length(args)) do i
         BroadcastDescriptor(args[i], keeps[i], Idefaults[i])
     end
-    gpu_call(broadcast_kernel!, out, (func, out, shape, Cuint(length(out)), descriptor_tuple, A, Bs...))
+    gpu_call(broadcast_kernel!, out, (func, out, shape, Cuint(length(out)), descriptor_tuple, A, deref.(Bs)...))
     out
 end

@@ -95,7 +99,7 @@ function Base.foreach(func, over::GPUArray, Bs...)
     descriptor_tuple = ntuple(length(args)) do i
         BroadcastDescriptor(args[i], keeps[i], Idefaults[i])
     end
-    gpu_call(foreach_kernel, over, (func, shape, Cuint.(length(over)), descriptor_tuple, over, Bs...))
+    gpu_call(foreach_kernel, over, (func, shape, Cuint.(length(over)), descriptor_tuple, over, deref.(Bs)...))
     return
 end

@@ -111,6 +115,9 @@ immutable BroadcastDescriptorN{Typ, N} <: BroadcastDescriptor{Typ}
     keep::NTuple{N, Cuint}
     idefault::NTuple{N, Cuint}
 end
+function BroadcastDescriptor(val::RefValue, keep, idefault)
+    BroadcastDescriptorN{Tuple, 1}((Cuint(1),), (Cuint(0),), (Cuint(1),))
+end

 function BroadcastDescriptor(val, keep, idefault)
     N = length(keep)
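
A quick illustration of the `deref` helper added above; the results follow directly from the two methods:

deref(2f0)       # -> 2.0f0, ordinary arguments pass through untouched
deref(Ref(2f0))  # -> (2.0f0,), the RefValue is unwrapped into a 1-tuple the kernel can take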

src/construction.jl

Lines changed: 4 additions & 2 deletions

@@ -1,4 +1,4 @@
-import Base: fill!, similar, eye, zeros, fill
+import Base: fill!, similar, eye, zeros, ones, fill


 function fill(X::Type{<: GPUArray}, val, dims::Integer...)
@@ -16,10 +16,12 @@ function fill!{T, N}(A::GPUArray{T, N}, val)
 end

 zeros(T::Type{<: GPUArray}, dims::NTuple{N, Integer}) where N = fill(T, zero(eltype(T)), dims)
+ones(T::Type{<: GPUArray}, dims::NTuple{N, Integer}) where N = fill(T, one(eltype(T)), dims)

 function eyekernel(state, res::AbstractArray{T}, stride) where T
     i = linear_index(state)
-    ilin = (stride * (i - 1)) + i
+    i > stride && return
+    ilin = (stride * (i - Cuint(1))) + i
     @inbounds res[ilin] = one(T)
     return
 end
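
The index arithmetic in `eyekernel` can be sanity-checked on the CPU: in a column-major matrix with `stride` rows, the i-th diagonal entry sits at linear index `stride * (i - 1) + i`, and the new bounds check stops threads past the diagonal. A plain-Julia sketch, no GPU involved:

stride = 3
[stride * (i - 1) + i for i in 1:stride]  # -> [1, 5, 9], the diagonal of a 3x3 matrix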

src/heuristics.jl

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 function thread_blocks_heuristic(len::Integer)
+    # TODO better threads default
     threads = min(len, 256)
     blocks = ceil(Int, len / threads)
-    blocks = blocks * threads
     (blocks,), (threads,)
 end
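
With the `blocks = blocks * threads` line removed, the heuristic now returns the block count itself rather than a padded global size. Values below follow directly from the definition above:

thread_blocks_heuristic(100)   # -> ((1,), (100,)): one block of 100 threads
thread_blocks_heuristic(1000)  # -> ((4,), (256,)): 4 blocks of 256 threads cover 1000 items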

src/jlbackend.jl

Lines changed: 17 additions & 16 deletions

@@ -65,13 +65,13 @@ mutable struct JLState{N}
     blockidx::NTuple{N, Int}
     threadidx::NTuple{N, Int}
     localmem_counter::Int
-    localmems::Vector{Vector{Vector}}
+    localmems::Vector{Vector{Array}}
 end

 function JLState(threads::NTuple{N}, blockdim::NTuple{N}) where N
     idx = ntuple(i-> 1, Val{N})
     blockcount = prod(blockdim)
-    lmems = [Vector{Vector}(0) for i in 1:blockcount]
+    lmems = [Vector{Array}(0) for i in 1:blockcount]
     JLState{N}(threads, blockdim, idx, idx, 0, lmems)
 end

@@ -86,37 +86,38 @@ function JLState(state::JLState{N}, threadidx::NTuple{N}) where N
     )
 end

-function LocalMemory(state::JLState, T, N)
+function LocalMemory(state::JLState, ::Type{T}, ::Val{N}, ::Val{C}) where {T, N, C}
     state.localmem_counter += 1
     lmems = state.localmems[blockidx_x(state)]
     # first invocation in block
     if length(lmems) < state.localmem_counter
-        lmem = zeros(T, N)
+        lmem = zeros(T, N...)
         push!(lmems, lmem)
         return lmem
     else
        return lmems[state.localmem_counter]
     end
 end

-function gpu_call(f, A::JLArray, args::Tuple, blocks = nothing, threads = C_NULL)
-    if blocks == nothing
-        blocks, threads = thread_blocks_heuristic(length(A))
-    elseif isa(blocks, Integer)
-        blocks = (blocks,)
-    end
-    if threads == C_NULL
-        threads = map(x-> 1, blocks)
-    end
+function (::Type{AbstractDeviceArray})(ptr::Array, shape::NTuple{N, Integer}) where N
+    reshape(ptr, Int.(shape))
+end
+function (::Type{AbstractDeviceArray})(ptr::Array, shape::Vararg{Integer, N}) where N
+    reshape(ptr, Int.(shape))
+end
+
+
+function _gpu_call(f, A::JLArray, args::Tuple, blocks_threads::Tuple{T, T}) where T <: NTuple{N, Integer} where N
+    blocks, threads = blocks_threads
     idx = ntuple(i-> 1, length(blocks))
-    blockdim = ceil.(Int, blocks ./ threads)
+    blockdim = blocks
     state = JLState(threads, blockdim)
     device_args = to_device.(state, args)
-    tasks = Vector{Task}(prod(threads))
+    tasks = Array{Task}(threads...)
     for blockidx in CartesianRange(blockdim)
         state.blockidx = blockidx.I
         block_args = to_blocks.(state, device_args)
-        for threadidx in CartesianRange(threads)
+        for threadidx in CartesianRange(Int.(threads))
            thread_state = JLState(state, threadidx.I)
            tasks[threadidx] = @async f(thread_state, block_args...)
         end
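
A small sketch of how a `(blocks, threads)` configuration plays out on this CPU backend: one task per thread is spawned for every block, and `linear_index(state)` is expected to cover `1:prod((blocks..., threads...))` as documented for `gpu_call`. The final contents rely on that contract and are an assumption:

a = JLArray(zeros(Int, 8))
gpu_call(a, (a,), ((2,), (4,))) do state, a
    # 2 blocks x 4 threads, each work item touches one element
    i = linear_index(state)
    i > length(a) && return
    @inbounds a[i] += 1
    return
end
Array(a)  # expected: all ones, every element visited exactly once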

src/linalg.jl

Lines changed: 63 additions & 25 deletions

@@ -1,49 +1,87 @@
-function transpose_kernel!(
-        state, At, A::AbstractArray{T}, width, height, ::Val{BLOCK}, ::Val{LMem}
-    ) where {BLOCK, LMem, T}
+import Base: transpose!, permutedims!
+# function transpose_kernel!(
+#         state, At, A::AbstractArray{T}, width, height, ::Val{BLOCK}, ::Val{LMem}
+#     ) where {BLOCK, LMem, T}
+#
+#     ui1 = UInt32(1)
+#     bidx_x = blockidx_x(state) - ui1
+#     bidx_y = blockidx_y(state) - ui1
+#     tidx_x = threadidx_x(state) - ui1
+#     tidx_y = threadidx_y(state) - ui1
+#
+#     A_local = @LocalMemory(state, T, LMem)
+#
+#     base_idx_a = bidx_x * BLOCK + bidx_y * (BLOCK * width)
+#     base_idx_a_t = bidx_y * BLOCK + bidx_x * (BLOCK * height)
+#
+#     glob_idx_a = base_idx_a + tidx_x + width * tidx_y
+#     glob_idx_a_t = base_idx_a_t + tidx_x + height * tidx_y
+#     glob_idx_a >= length(A) && return
+#     A_local[tidx_y * BLOCK + tidx_x + ui1] = A[glob_idx_a + ui1]
+#     synchronize_threads(state)
+#     At[glob_idx_a_t + ui1] = A_local[tidx_x * BLOCK + tidx_y + ui1]
+#     return
+# end

-    ui1 = UInt32(1)
+function transpose_blocks!(
+        state, odata::AbstractArray{T}, idata, ::Val{SHMEM}, ::Val{TDIM}, ::Val{BLOCK_ROWS}, ::Val{NROW}
+    ) where {T, SHMEM, TDIM, BLOCK_ROWS, NROW}
+
+    ui1 = Cuint(1)
+    tile = @LocalMemory(state, T, SHMEM)
     bidx_x = blockidx_x(state) - ui1
     bidx_y = blockidx_y(state) - ui1
     tidx_x = threadidx_x(state) - ui1
     tidx_y = threadidx_y(state) - ui1

-    A_local = LocalMemory(state, T, LMem)
+    x = bidx_x * TDIM + tidx_x + ui1
+    y = bidx_y * TDIM + tidx_y + ui1
+    dims = size(idata)

-    base_idx_a = bidx_x * BLOCK + bidx_y * (BLOCK * width)
-    base_idx_a_t = bidx_y * BLOCK + bidx_x * (BLOCK * height)
+    (x <= dims[2] && (y + (BLOCK_ROWS * Cuint(3))) <= dims[1]) || return

-    glob_idx_a = base_idx_a + tidx_x + width * tidx_y
-    glob_idx_a_t = base_idx_a_t + tidx_x + height * tidx_y
+    for j = Cuint(0):Cuint(3)
+        j0 = j * BLOCK_ROWS
+        tile[tidx_x + ui1, tidx_y + j0 + ui1] = idata[y + j0, x]
+    end

-    A_local[tidx_y * BLOCK + tidx_x + ui1] = A[glob_idx_a + ui1]
     synchronize_threads(state)
-    At[glob_idx_a_t + ui1] = A_local[tidx_x * BLOCK + tidx_y + ui1]
+    for j = Cuint(0):Cuint(3)
+        j0 = j * BLOCK_ROWS
+        odata[x, y + j0] = tile[tidx_x + ui1, tidx_y + j0 + ui1]
+    end
     return
 end

-function max_block_size(dev, h::Int, w::Int)
-    dim1, dim2 = GPUArrays.blocks(dev)[1:2]
-    wgsize = GPUArrays.threads(dev)
-    wglimit = floor(Int, sqrt(wgsize))
-    return gcd(dim1, dim2, h, w, wglimit)
-end
+function transpose!{T}(At::GPUArray{T, 2}, A::GPUArray{T, 2})
+    if size(A, 1) == size(A, 2) && all(x-> x % 32 == 0, size(A))
+        outsize = UInt32.(size(At))
+        TDIM = Cuint(32); BLOCK_ROWS = Cuint(8)
+        nrows = TDIM ÷ BLOCK_ROWS
+        shmemdim = (TDIM, (TDIM + Cuint(1)))
+        static_params = map(x-> Val{x}(), (shmemdim, TDIM, BLOCK_ROWS, nrows))
+        args = (At, A, static_params...)

-function Base.transpose!{T}(At::GPUArray{T, 2}, A::GPUArray{T, 2})
-    dev = GPUArrays.device(A)
-    block_size = max_block_size(dev, size(A)...)
-    outsize = UInt32.(size(At))
-    lmem = block_size * (block_size + 1)
-    args = (At, A, outsize..., Val{block_size}(), Val{lmem}())
-    gpu_call(transpose_kernel!, At, args, (block_size, block_size))
+        griddim = ceil.(Int, size(A) ./ (TDIM, TDIM))
+        blockdim = (TDIM, BLOCK_ROWS)
+        # optimized version for 32x & square dimensions
+        gpu_call(transpose_blocks!, At, args, (griddim, blockdim))
+    else
+        # simple fallback
+        gpu_call(At, (At, A)) do state, At, A
+            idx = @cartesianidx A state
+            @inbounds At[idx[2], idx[1]] = A[idx[1], idx[2]]
+            return
+        end
+    end
     At
 end

 function genperm(I::NTuple{N}, perm::NTuple{N}) where N
     ntuple(d-> I[perm[d]], Val{N})
 end

-function Base.permutedims!(dest::GPUArray, src::GPUArray, perm)
+function permutedims!(dest::GPUArray, src::GPUArray, perm)
     perm = Cuint.((perm...,))
     gpu_call(dest, (dest, src, perm)) do state, dest, src, perm
         I = @cartesianidx dest state
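
Usage is unchanged from the caller's side; a hedged sketch of which path the new `transpose!` takes, with illustrative array sizes and the `JLArray` backend assumed:

A  = JLArray(rand(Float32, 64, 64))
At = similar(A)
transpose!(At, A)   # square and 32-divisible: tiled transpose_blocks! kernel with local memory

B  = JLArray(rand(Float32, 33, 17))
Bt = similar(B, (17, 33))
transpose!(Bt, B)   # any other shape: simple fallback kernel using @cartesianidx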

src/mapreduce.jl

Lines changed: 6 additions & 4 deletions

@@ -1,10 +1,12 @@
+import Base: any, count, countnz
+
 #############################
 # reduce
 # functions in base implemented with a direct loop need to be overloaded to use mapreduce
 any(pred, A::GPUArray) = Bool(mapreduce(pred, |, Cint(0), A))
 count(pred, A::GPUArray) = Int(mapreduce(pred, +, Cuint(0), A))
-Base.countnz(A::GPUArray) = Int(mapreduce(x-> x != 0, +, Cuint(0), A))
-Base.countnz(A::GPUArray, dim) = Int(mapreducedim(x-> x != 0, +, Cuint(0), A, dim))
+countnz(A::GPUArray) = Int(mapreduce(x-> x != 0, +, Cuint(0), A))
+countnz(A::GPUArray, dim) = Int(mapreducedim(x-> x != 0, +, Cuint(0), A, dim))


 # hack to get around of fetching the first element of the GPUArray
@@ -79,7 +81,7 @@ for i = 0:10
     # http://developer.amd.com/resources/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
     function reduce_kernel(state, f, op, v0::T, A, ::Val{LMEM}, result, $(args...)) where {T, LMEM}
         ui0 = Cuint(0); ui1 = Cuint(1); ui2 = Cuint(2)
-        tmp_local = LocalMemory(state, T, LMEM)
+        tmp_local = @LocalMemory(state, T, LMEM)
         global_index = linear_index(state)
         acc = v0
         # # Loop sequentially over chunks of input vector
@@ -128,6 +130,6 @@ function acc_mapreduce{T, OT, N}(
     out = similar(A, OT, (blocksize,))
     fill!(out, v0)
     args = (f, op, v0, A, Val{threads}(), out, rest...)
-    gpu_call(reduce_kernel, A, args, (blocksize * threads,), (threads,))
+    gpu_call(reduce_kernel, A, args, ((blocksize,), (threads,)))
     reduce(op, Array(out))
 end
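
A brief usage sketch of the reductions touched above, assuming the `JLArray` backend; the wrappers dispatch to `mapreduce`, which launches `reduce_kernel` with the new `((blocksize,), (threads,))` configuration:

a = JLArray(Float32[0, 1, 2, 0, 3])
countnz(a)           # -> 3
count(x-> x > 1, a)  # -> 2
any(x-> x > 2, a)    # -> true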
