Skip to content
This repository was archived by the owner on Sep 27, 2021. It is now read-only.

Commit 5fec458

Browse files
committed
bug fixes, fixes for reduce
1 parent ec3d9ec commit 5fec458

File tree

7 files changed

+46
-71
lines changed

7 files changed

+46
-71
lines changed

src/CLArrays.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,6 @@ include("compilation.jl")
1919
include("mapreduce.jl")
2020
include("3rdparty.jl")
2121

22-
export CLArray
22+
export CLArray, gpu_call
2323

2424
end # module

src/array.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ function (::Type{CLArray{T, N}})(size::NTuple{N, Integer}, ctx = global_context(
3434
CLArray{clT, N}(size, ptr)
3535
end
3636

37+
raw_print(msg::AbstractString...) =
38+
ccall(:write, Cssize_t, (Cint, Cstring, Csize_t), 1, join(msg), length(join(msg)))
3739

3840
similar(::Type{<: CLArray}, ::Type{T}, size::Base.Dims{N}) where {T, N} = CLArray{T, N}(size)
3941

@@ -42,6 +44,7 @@ function unsafe_free!(a::CLArray)
4244
ctxid = context(ptr).id
4345
if cl.is_ctx_id_alive(ctxid) && ctxid != C_NULL
4446
Mem.free(ptr)
47+
Mem.current_allocated_mem[] -= sizeof(eltype(a)) * length(a)
4548
end
4649
#TODO logging that we don't free since context is not alive
4750
end

src/compilation.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,15 @@ function assemble_kernel(m::CLMethod)
161161
kernel_ptrs = []
162162
body = Expr(:block)
163163
nargs = method_nargs(m)
164+
# declare rest of slots
165+
for (i, (T, name)) in enumerate(getslots!(m)[nargs+1:end])
166+
slot = TypedSlot(i + nargs, T)
167+
push!(m.decls, slot)
168+
push!(m, T)
169+
tmp = :($name::$T)
170+
tmp.typ = T
171+
push!(body.args, tmp)
172+
end
164173
st = getslots!(m)[2:nargs] # don't include self
165174
arg_idx = 1
166175
ptr_extract = []
@@ -196,6 +205,7 @@ function assemble_kernel(m::CLMethod)
196205
push!(kernel_args, :($argslot::$T))
197206
end
198207
end
208+
199209
append!(kernel_args, kernel_ptrs)
200210
real_body = _getast(m)
201211
body.typ = real_body.typ # use real type

src/intrinsics.jl

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
using Transpiler.cli: get_local_id, get_global_id, barrier, CLK_LOCAL_MEM_FENCE
2-
using Transpiler.cli: get_local_size, get_global_size, get_group_id
3-
import GPUArrays: synchronize, synchronize_threads, device
2+
using Transpiler.cli: get_local_size, get_global_size, get_group_id, get_num_groups
3+
import GPUArrays: synchronize, synchronize_threads, device, global_size, linear_index
44
#synchronize
55
function synchronize(x::CLArray)
6-
cl.finish(context(x).queue) # TODO figure out the diverse ways of synchronization
6+
cl.finish(global_queue(x)) # TODO figure out the diverse ways of synchronization
77
end
88

99

@@ -12,12 +12,14 @@ immutable KernelState
1212
KernelState() = new(Int32(0))
1313
end
1414

15-
for (f, fcl, isidx) in (
16-
(:blockidx, get_group_id, true),
17-
(:blockdim, get_local_size, false),
18-
(:threadidx, get_local_id, true)
19-
)
20-
for (i, sym) in enumerate((:x, :y, :z))
15+
for (i, sym) in enumerate((:x, :y, :z))
16+
for (f, fcl, isidx) in (
17+
(:blockidx, get_group_id, true),
18+
(:blockdim, get_local_size, false),
19+
(:threadidx, get_local_id, true),
20+
(:griddim, get_num_groups, false)
21+
)
22+
2123
fname = Symbol(string(f, '_', sym))
2224
if isidx
2325
@eval GPUArrays.$fname(::KernelState)::Cuint = $fcl($(i-1)) + Cuint(1)
@@ -27,4 +29,8 @@ for (f, fcl, isidx) in (
2729
end
2830
end
2931

32+
global_size(state::KernelState) = get_global_size(0)
33+
linear_index(state::KernelState) = get_global_id(0) + Cuint(1)
34+
35+
3036
synchronize_threads(::KernelState) = cli.barrier(CLK_LOCAL_MEM_FENCE)

src/mapreduce.jl

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,56 +0,0 @@
1-
import GPUArrays: acc_mapreduce
2-
using Transpiler.cli: get_local_id, get_global_id, barrier, CLK_LOCAL_MEM_FENCE
3-
using Transpiler.cli: get_local_size, get_global_size, get_group_id
4-
using GPUArrays: blockdim_x, blockidx_x, threadidx_x, synchronize, synchronize_threads, device
5-
6-
for i = 0:10
7-
args = ntuple(x-> Symbol("arg_", x), i)
8-
fargs = ntuple(x-> :(broadcast_index($(args[x]), length, global_index)), i)
9-
@eval begin
10-
function reduce_kernel(state, f, op, v0, A, tmp_local, length, result, $(args...))
11-
ui1 = Cuint(1)
12-
global_index = get_global_id(0) + ui1
13-
local_v0 = v0
14-
# Loop sequentially over chunks of input vector
15-
while (global_index <= length)
16-
element = f(A[global_index], $(fargs...))
17-
local_v0 = op(local_v0, element)
18-
global_index += get_global_size(0)
19-
end
20-
21-
# Perform parallel reduction
22-
local_index = threadidx_x(state)
23-
tmp_local[local_index + ui1] = local_v0
24-
barrier(CLK_LOCAL_MEM_FENCE)
25-
offset = blockdim_x(state) ÷ ui1
26-
while offset > 0
27-
if (local_index < offset)
28-
other = tmp_local[local_index + offset + ui1]
29-
mine = tmp_local[local_index + ui1]
30-
tmp_local[local_index + ui1] = op(mine, other)
31-
end
32-
barrier(CLK_LOCAL_MEM_FENCE)
33-
offset = offset ÷ Cuint(2)
34-
end
35-
if local_index == Cuint(0)
36-
result[blockidx_x(state) + ui1] = tmp_local[1]
37-
end
38-
return
39-
end
40-
end
41-
end
42-
43-
function acc_mapreduce{T, OT, N}(
44-
f, op, v0::OT, A::CLArray{T, N}, rest::Tuple
45-
)
46-
dev = device(A)
47-
block_size = 16
48-
group_size = ceil(Int, length(A) / block_size)
49-
out = similar(A, OT, (group_size,))
50-
fill!(out, v0)
51-
lmem = LocalMemory{OT}(block_size)
52-
args = (f, op, v0, A, lmem, Cuint(length(A)), out, rest...)
53-
gpu_call(reduce_kernel, A, args, (group_size * block_size,), (block_size,))
54-
println(Array(out))
55-
reduce(op, Array(out))
56-
end

src/ondevice.jl

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,22 @@ start(x::OnDeviceArray) = Cuint(1)
2626
next(x::OnDeviceArray, state::Cuint) = x[state], state + Cuint(1)
2727
done(x::OnDeviceArray, state::Cuint) = state > length(x)
2828

29+
getindex(x::OnDeviceArray, ilin::Integer) = x.ptr[ilin]
2930
function getindex(x::OnDeviceArray{T, N}, i::Vararg{Integer, N}) where {T, N}
3031
ilin = gpu_sub2ind(size(x), Cuint.(i))
3132
return x.ptr[ilin]
3233
end
33-
function setindex!(x::OnDeviceArray{T, N}, val, i::Vararg{Integer, N}) where {T, N}
34-
ilin = gpu_sub2ind(size(x), Cuint.(i))
34+
function setindex!(x::OnDeviceArray{T, N}, val, ilin::Integer) where {T, N}
3535
x.ptr[ilin] = T(val)
3636
return
3737
end
38-
function setindex!(x::OnDeviceArray{T, N}, val, ilin::Integer) where {T, N}
38+
39+
function setindex!(x::OnDeviceArray{T, N}, val, i::Vararg{Integer, N}) where {T, N}
40+
ilin = gpu_sub2ind(size(x), Cuint.(i))
3941
x.ptr[ilin] = T(val)
4042
return
4143
end
42-
getindex(x::OnDeviceArray, ilin::Integer) = x.ptr[ilin]
44+
4345

4446

4547
kernel_convert(A::CLArray{T, N}) where {T, N} = PreDeviceArray{T, N}(HostPtr{T}(), A.size)

test/runtests.jl

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,14 @@
11
using CLArrays
22
using GPUArrays.TestSuite, Base.Test
3-
3+
using GPUArrays: global_size
4+
using CUDAnative, CUDAdrv
45
TestSuite.run_tests(CLArray)
6+
7+
using CLArrays
8+
9+
x = CLArray(rand(Float32, 10))
10+
11+
GPUArrays.gpu_call(x, (x,)) do state, l
12+
l[1] = 1f0 ^ 1.0
13+
return
14+
end

0 commit comments

Comments (0)