Commit 4b05834

fix mapreduce, make jlbackend async
1 parent 72a0637 commit 4b05834

16 files changed: +296 -110 lines changed

src/GPUArrays.jl

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ include("vectors.jl")
 include("testsuite/testsuite.jl")
 include("jlbackend.jl")
 
-export GPUArray, gpu_call, thread_blocks_heuristic
+export GPUArray, gpu_call, thread_blocks_heuristic, global_size
 export linear_index, @linearidx, @cartesianidx
 
 end # module

src/abstract_gpu_interface.jl

Lines changed: 15 additions & 9 deletions
@@ -2,11 +2,14 @@
 Abstraction over the GPU thread indexing functions.
 Uses CUDA like names
 =#
-for f in (:blockidx, :blockdim, :threadidx), sym in (:x, :y, :z)
-    fname = Symbol(string(f, '_', sym))
-    @eval $fname(state)::Cuint = error("Not implemented")
-    @eval export $fname
+for sym in (:x, :y, :z)
+    for f in (:blockidx, :blockdim, :threadidx, :griddim)
+        fname = Symbol(string(f, '_', sym))
+        @eval $fname(state)::Cuint = error("Not implemented")
+        @eval export $fname
+    end
 end
+
 """
 in CUDA terms `__synchronize`
 """
@@ -15,11 +18,14 @@ function synchronize_threads(state)
 end
 
 """
-linear index in a GPU kernel
+linear index in a GPU kernel (equal to OpenCL.get_global_id)
 """
 @inline function linear_index(state)
     Cuint((blockidx_x(state) - Cuint(1)) * blockdim_x(state) + threadidx_x(state))
 end
+@inline function global_size(state)
+    griddim_x(state) * blockdim_x(state)
+end
 
 """
 Blocks until all operations are finished on `A`
@@ -36,10 +42,10 @@ function device(A::GPUArray)
     # makes it easier to write generic code that also works for AbstractArrays
 end
 
-
-@inline function synchronize_threads(state)
-    CUDAnative.__syncthreads()
-end
+#
+# @inline function synchronize_threads(state)
+#     CUDAnative.__syncthreads()
+# end
 
 macro linearidx(A, statesym = :state)
     quote

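For orientation, here is a minimal sketch of how `linear_index` and the new `global_size` are typically combined in a grid-stride loop. The kernel, `dst`, `src`, and `n` are hypothetical names for illustration, not part of this commit:

    # copies `src` into `dst`, assuming fewer launched threads than elements
    function copy_kernel(state, dst, src, n)
        i = linear_index(state)        # 1-based global thread index
        stride = global_size(state)    # total number of launched threads
        while i <= n
            @inbounds dst[i] = src[i]
            i += stride
        end
        return
    end

    # launched via gpu_call, which picks blocks/threads with thread_blocks_heuristic:
    # gpu_call(copy_kernel, dst, (dst, src, Cuint(length(dst))))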
src/abstractarray.jl

Lines changed: 6 additions & 0 deletions
@@ -234,3 +234,9 @@ function _reshape(A::GPUArray{T}, dims::Dims) where T
     prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims"))
     return unsafe_reinterpret(T, A, dims)
 end
+#ambig
+function _reshape(A::GPUArray{T, 1}, dims::Tuple{Int}) where T
+    n = Base._length(A)
+    prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims"))
+    return unsafe_reinterpret(T, A, dims)
+end

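The extra method only resolves a dispatch ambiguity with Base's `_reshape(::AbstractVector, ::Dims{1})`; its body is identical to the general method above. A hypothetical call that hits the ambiguous signature pair (assuming a 1D backend array `a`):

    a = JLArray(rand(Float32, 16))
    b = reshape(a, (16,))   # 1D array + 1-tuple of Int: now dispatches unambiguously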
src/base.jl

Lines changed: 1 addition & 3 deletions
@@ -1,8 +1,6 @@
 import Base: count, map!, permutedims!, cat_t, vcat, hcat
 using Base: @pure
 
-count(pred, A::GPUArray) = Int(mapreduce(pred, +, Cuint(0), A))
-
 allequal(x) = true
 allequal(x, y, z...) = x == y && allequal(y, z...)
 function map!(f, y::GPUArray, xs::GPUArray...)
@@ -74,7 +72,7 @@ end
     (ind-l*indnext+f, _ind2sub(Base.tail(inds), indnext)...)
 end
 
-@pure function gpu_sub2ind{N, T}(dims::NTuple{N}, I::NTuple{N, T})
+@pure function gpu_sub2ind{N, N2, T}(dims::NTuple{N}, I::NTuple{N2, T})
     Base.@_inline_meta
     _sub2ind(NTuple{N, T}(dims), T(1), T(1), I...)
 end

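As a reminder of what `gpu_sub2ind` computes (column-major linearisation, now also accepting an index tuple whose length differs from `dims`), a small worked example; the values are illustrative and not part of the commit:

    # dims = (3, 4), I = (2, 3): linear index = 2 + (3 - 1) * 3 = 8
    gpu_sub2ind((Cuint(3), Cuint(4)), (Cuint(2), Cuint(3)))   # == 8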
src/blas.jl

Lines changed: 3 additions & 2 deletions
@@ -39,6 +39,7 @@ for elty in (Float64, Float32)
     end
 end
 
+Base.scale!(s::Real, X::GPUArray) = scale!(X, s)
 function Base.scale!(X::GPUArray{T}, s::Real) where T <: BLAS.BlasComplex
     R = typeof(real(zero(T)))
     buff = reinterpret(R, vec(X))
@@ -81,8 +82,8 @@ for elty in (Float32, Float64, Complex64, Complex128)
         if length(x) != length(y)
             throw(DimensionMismatch("x has length $(length(x)), but y has length $(length(y))"))
         end
-        blasmod = blas_module(A)
-        blasmod.axpy!($elty(alpha), blasbuffer(dx), blasbuffer(dx))
+        blasmod = blas_module(x)
+        blasmod.axpy!($elty(alpha), blasbuffer(vec(x)), blasbuffer(vec(y)))
         y
     end
 end

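The axpy! fix matters because BLAS `axpy!` updates `y` in place as `y = alpha*x + y`; the old body referenced `A` and `dx` (names not in scope here) and passed the same buffer twice, so `y` was never written. A hedged usage sketch, assuming the generated method is the usual `Base.LinAlg.axpy!` for these element types and that the array names are hypothetical:

    x = JLArray(rand(Float32, 8))
    y = JLArray(rand(Float32, 8))
    Base.LinAlg.axpy!(2f0, x, y)   # y now holds 2*x + old y
    Base.scale!(3f0, y)            # new method: forwards to scale!(y, 3f0)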
src/broadcast.jl

Lines changed: 20 additions & 1 deletion
@@ -27,6 +27,19 @@ function broadcast!(f::typeof(identity), A::GPUArray, val::Number)
     gpu_call(const_kernel2, A, (A, valconv, Cuint(length(A))))
     A
 end
+@inline function broadcast_t(f, T::Type{Bool}, shape, it, A::GPUArrays.GPUArray, Bs::Vararg{Any,N}) where N
+    C = similar(A, T, shape)
+    keeps, Idefaults = map_newindexer(shape, A, Bs)
+    _broadcast!(f, C, keeps, Idefaults, A, Bs, Val{N}, it)
+    return C
+end
+@inline function broadcast_t(f, T::Type{Bool}, shape, it, A::GPUArrays.GPUArray, B::GPUArrays.GPUArray, Bs::Vararg{Any,N}) where N
+    C = similar(A, T, shape)
+    Bs = (B, Bs...)
+    keeps, Idefaults = map_newindexer(shape, A, Bs)
+    _broadcast!(f, C, keeps, Idefaults, A, Bs, Val{N}, it)
+    return C
+end
 
 @inline function broadcast_t(
     f, ::Type{T}, shape, iter, A::GPUArray, Bs::Vararg{Any,N}
@@ -195,7 +208,13 @@ end
 @pure newindex(I, ilin, keep::Tuple{}, Idefault::Tuple{}, size::Tuple{}) = Cuint(1)
 
 # optimize for 1D arrays
-@pure newindex(I::NTuple{1}, ilin, keep::NTuple{1}, Idefault, size) = ilin
+@pure function newindex(I::NTuple{1}, ilin, keep::NTuple{1}, Idefault, size)
+    if Bool(keep[1])
+        return ilin
+    else
+        return Idefault[1]
+    end
+end
 
 # differently shaped arrays
 @generated function newindex{N, T}(I, ilin::T, keep::NTuple{N}, Idefault, size)

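The `newindex` change fixes 1D broadcasts where one argument has to be "repeated": when `keep[1]` is false (that argument has extent 1 along the dimension), its index must collapse to `Idefault[1]` instead of following the output's linear index. A hypothetical case that exercises it (array names are illustrative only):

    a = JLArray(rand(Float32, 8))
    b = JLArray(rand(Float32, 1))
    a .+ b    # b's index must stay at 1 for every output element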
src/construction.jl

Lines changed: 1 addition & 5 deletions
@@ -1,4 +1,4 @@
-import Base: fill!, rand, similar, eye, zeros, fill
+import Base: fill!, similar, eye, zeros, fill
 
 
 function fill(X::Type{<: GPUArray}, val, dims::Integer...)
@@ -32,10 +32,6 @@ function eye(T::Type{<: GPUArray}, dims::NTuple{2, Integer})
     res
 end
 
-function rand{T <: GPUArray, ET}(::Type{T}, ::Type{ET}, size...)
-    T(rand(ET, size...))
-end
-
 (T::Type{<: GPUArray})(dims::Integer...) = T(dims)
 (T::Type{<: GPUArray{X} where X})(dims::NTuple{N, Integer}) where N = similar(T, eltype(T), dims)
 

src/jlbackend.jl

Lines changed: 45 additions & 17 deletions
@@ -7,12 +7,25 @@ struct JLArray{T, N} <: GPUArray{T, N}
     size::NTuple{N, Int}
 end
 
+"""
+Thread group local memory
+"""
+immutable LocalMem{N, T}
+    x::NTuple{N, Vector{T}}
+end
+
 size(x::JLArray) = x.size
 pointer(x::JLArray) = pointer(x.data)
-to_device(x::JLArray) = x.data
-to_device(x::Tuple) = to_device.(x)
-to_device(x::RefValue{<: JLArray}) = RefValue(to_device(x[]))
-to_device(x) = x
+to_device(state, x::JLArray) = x.data
+to_device(state, x::Tuple) = to_device.(state, x)
+to_device(state, x::RefValue{<: JLArray}) = RefValue(to_device(state, x[]))
+to_device(state, x) = x
+# creates a `local` vector for each thread group
+to_device(state, x::LocalMemory{T}) where T = LocalMem(ntuple(i-> Vector{T}(x.size), blockdim_x(state)))
+
+to_blocks(state, x) = x
+# unpacks local memory for each block
+to_blocks(state, x::LocalMem) = x.x[blockidx_x(state)]
 
 function (::Type{JLArray{T, N}})(size::NTuple{N, Integer}) where {T, N}
     JLArray{T, N}(Array{T, N}(size), size)
@@ -47,31 +60,35 @@ end
 
 mutable struct JLState{N}
     blockdim::NTuple{N, Int}
-    threads::NTuple{N, Int}
+    griddim::NTuple{N, Int}
 
     blockidx::NTuple{N, Int}
     threadidx::NTuple{N, Int}
 end
 
-
 function gpu_call(f, A::JLArray, args::Tuple, blocks = nothing, threads = C_NULL)
     if blocks == nothing
         blocks, threads = thread_blocks_heuristic(length(A))
     elseif isa(blocks, Integer)
         blocks = (blocks,)
-        if threads == C_NULL
-            threads = (1,)
-        end
+    end
+    if threads == C_NULL
+        threads = (1,)
     end
     idx = ntuple(i-> 1, length(blocks))
     blockdim = ceil.(Int, blocks ./ threads)
-    state = JLState(threads, threads, idx, idx)
-    device_args = to_device.(args)
+    state = JLState(threads, blockdim, idx, idx)
+    device_args = to_device.(state, args)
+    tasks = Vector{Task}(threads...)
     for blockidx in CartesianRange(blockdim)
        state.blockidx = blockidx.I
+        block_args = to_blocks.(state, device_args)
        for threadidx in CartesianRange(threads)
-            state.threadidx = threadidx.I
-            f(state, device_args...)
+            thread_state = JLState(state.blockdim, state.griddim, state.blockidx, threadidx.I)
+            tasks[threadidx] = @async f(thread_state, block_args...)
+        end
+        for t in tasks
+            wait(t)
        end
    end
    return
@@ -83,11 +100,22 @@ device(x::JLArray) = JLDevice()
 threads(dev::JLDevice) = 256
 
 
-@inline synchronize_threads(::JLState) = nothing
+@inline function synchronize_threads(::JLState)
+    #=
+    All threads are getting started asynchronously, so a yield will
+    yield to the next execution of the same function, which should call yield
+    at the exact same point in the program, leading to a chain of yields effectively syncing
+    the tasks (threads).
+    =#
+    yield()
+    return
+end
 
-for f in (:blockidx, :blockdim, :threadidx), (i, sym) in enumerate((:x, :y, :z))
-    fname = Symbol(string(f, '_', sym))
-    @eval $fname(state::JLState) = Cuint(state.$f[$i])
+for (i, sym) in enumerate((:x, :y, :z))
+    for f in (:blockidx, :blockdim, :threadidx, :griddim)
+        fname = Symbol(string(f, '_', sym))
+        @eval $fname(state::JLState) = Cuint(state.$f[$i])
+    end
 end
 
 blas_module(::JLArray) = Base.LinAlg.BLAS

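The yield-based `synchronize_threads` relies on the cooperative, roughly round-robin scheduling of `@async` tasks: each task runs until it reaches the same `yield()`, so every write before the barrier happens before any task resumes after it. A standalone sketch of that idea, independent of GPUArrays (all names hypothetical) and assuming the scheduler's FIFO behaviour described in the comment above:

    results = zeros(Int, 4)
    tasks = map(1:4) do i
        @async begin
            results[i] = i               # phase 1: every task writes its slot
            yield()                      # cooperative "barrier"
            @assert all(results .!= 0)   # phase 2: observes all phase-1 writes
        end
    end
    foreach(wait, tasks)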
src/linalg.jl

Lines changed: 39 additions & 39 deletions
@@ -1,48 +1,48 @@
-# function transpose_kernel!(
-#         state, At, A, width, height, A_local, ::Val{BLOCK}
-#     ) where BLOCK
-#
-#     ui1 = UInt32(1)
-#     bidx_x = blockidx_x(state) - ui1
-#     bidx_y = blockidx_y(state) - ui1
-#     tidx_x = threadidx_x(state) - ui1
-#     tidx_y = threadidx_y(state) - ui1
-#
-#     base_idx_a = bidx_x * BLOCK + bidx_y * (BLOCK * width)
-#     base_idx_a_t = bidx_y * BLOCK + bidx_x * (BLOCK * height)
-#
-#     glob_idx_a = base_idx_a + tidx_x + width * tidx_y
-#     glob_idx_a_t = base_idx_a_t + tidx_x + height * tidx_y
-#
-#     A_local[tidx_y * BLOCK + tidx_x + ui1] = A[glob_idx_a + ui1]
-#
-#     cli.barrier(cli.CLK_LOCAL_MEM_FENCE)
-#     At[glob_idx_a_t + ui1] = A_local[tidx_x * BLOCK + tidx_y + ui1]
-#     return
-# end
-#
-# function max_block_size(dev, h::Int, w::Int)
-#     dim1, dim2 = GPUArrays.blocks(dev)[1:2]
-#     wgsize = GPUArrays.threads(dev)
-#     wglimit = floor(Int, sqrt(wgsize))
-#     return gcd(dim1, dim2, h, w, wglimit)
-# end
-#
-# function Base.transpose!{T}(At::GPUArray{T, 2}, A::GPUArray{T, 2})
-#     dev = GPUArrays.device(A)
-#     block_size = max_block_size(dev, size(A)...)
-#     outsize = UInt32.(size(At))
-#     lmem = GPUArrays.LocalMemory{T}(block_size * (block_size + 1))
-#     args = (At, A, outsize..., lmem, Val{block_size}())
-#     gpu_call(transpose_kernel!, At, args, (block_size, block_size))
-#     At
-# end
+function transpose_kernel!(
+        state, At, A, width, height, A_local, ::Val{BLOCK}
+    ) where BLOCK
+
+    ui1 = UInt32(1)
+    bidx_x = blockidx_x(state) - ui1
+    bidx_y = blockidx_y(state) - ui1
+    tidx_x = threadidx_x(state) - ui1
+    tidx_y = threadidx_y(state) - ui1
+
+    base_idx_a = bidx_x * BLOCK + bidx_y * (BLOCK * width)
+    base_idx_a_t = bidx_y * BLOCK + bidx_x * (BLOCK * height)
+
+    glob_idx_a = base_idx_a + tidx_x + width * tidx_y
+    glob_idx_a_t = base_idx_a_t + tidx_x + height * tidx_y
+
+    A_local[tidx_y * BLOCK + tidx_x + ui1] = A[glob_idx_a + ui1]
+    synchronize_threads(state)
+    At[glob_idx_a_t + ui1] = A_local[tidx_x * BLOCK + tidx_y + ui1]
+    return
+end
+
+function max_block_size(dev, h::Int, w::Int)
+    dim1, dim2 = GPUArrays.blocks(dev)[1:2]
+    wgsize = GPUArrays.threads(dev)
+    wglimit = floor(Int, sqrt(wgsize))
+    return gcd(dim1, dim2, h, w, wglimit)
+end
+
+function Base.transpose!{T}(At::GPUArray{T, 2}, A::GPUArray{T, 2})
+    dev = GPUArrays.device(A)
+    block_size = max_block_size(dev, size(A)...)
+    outsize = UInt32.(size(At))
+    lmem = GPUArrays.LocalMemory{T}(block_size * (block_size + 1))
+    args = (At, A, outsize..., lmem, Val{block_size}())
+    gpu_call(transpose_kernel!, At, args, (block_size, block_size))
+    At
+end
 
 function genperm(I::NTuple{N}, perm::NTuple{N}) where N
     ntuple(d-> I[perm[d]], Val{N})
 end
 
 function Base.permutedims!(dest::GPUArray, src::GPUArray, perm)
+    perm = Cuint.((perm...,))
     gpu_call(dest, (dest, src, perm)) do state, dest, src, perm
         I = @cartesianidx dest state
         @inbounds dest[I...] = src[genperm(I, perm)...]

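`genperm` just permutes an index tuple, e.g. `genperm((2, 3, 4), (3, 1, 2)) == (4, 2, 3)`; converting `perm` to a tuple of `Cuint` up front keeps the kernel argument a plain isbits tuple. A hedged usage sketch (the array construction and sizes here are illustrative, not part of the commit):

    src = JLArray(rand(Float32, 2, 3, 4))
    dest = JLArray{Float32, 3}((4, 2, 3))
    permutedims!(dest, src, (3, 1, 2))   # perm becomes Cuint.((3, 1, 2)) internally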