
Commit 3a52020

performance fixes
1 parent 7c4e254 commit 3a52020

3 files changed: +17 additions, -28 deletions

src/backends/cudanative/cudanative.jl

Lines changed: 2 additions & 22 deletions
@@ -125,7 +125,7 @@ end
 
 thread_blocks_heuristic{N}(s::NTuple{N, Integer}) = thread_blocks_heuristic(prod(s))
 function thread_blocks_heuristic(len::Integer)
-    threads = min(len, 1024)
+    threads = min(len, 256)
     blocks = ceil(Int, len/threads)
     blocks, threads
 end
@@ -306,25 +306,6 @@ function acc_mapreduce{T, OT, N}(
 end
 
 
-# TODO figure out how interact with CUDArt and CUDAdr
-#GFFT = GPUArray(Complex64, div(size(G,1),2)+1, size(G,2))
-# function Base.fft!(A::CUArray)
-#     G, GFFT = CUFFT.RCpair(A)
-#     fft!(G, GFFT)
-# end
-# function Base.fft!(out::CUArray, A::CUArray)
-#     plan(out, A)(out, A, true)
-# end
-#
-# function Base.ifft!(A::CUArray)
-#     G, GFFT = CUFFT.RCpair(A)
-#     ifft!(G, GFFT)
-# end
-# function Base.ifft!(out::CUArray, A::CUArray)
-#     plan(out, A)(out, A, false)
-# end
-
 ########################################
 # CUBLAS
 
@@ -343,8 +324,7 @@ if is_blas_supported(:CUBLAS)
     # # implement blas interface
     hasblas(::CUContext) = true
    blas_module(::CUContext) = CUBLAS
-    blasbuffer(ctx::CUContext, A) = to_cudart(A)
-
+    blasbuffer(ctx::CUContext, A) = buffer(A)
 end
 
 if is_fft_supported(:CUFFT)
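Context for the CUDAnative change: the launch heuristic now caps the block size at 256 threads instead of 1024 and derives the block count from the flattened length, and the CUBLAS glue passes buffer(A) directly instead of converting through to_cudart. A minimal standalone sketch of the launch arithmetic only (plain Julia, no GPU required; the launch_config name and the example lengths are just for this illustration, not part of the commit):

# Sketch of the launch heuristic after this commit: at most 256 threads per
# block, and enough blocks to cover `len` elements (the last block may be
# only partially used).
function launch_config(len::Integer)
    threads = min(len, 256)
    blocks  = ceil(Int, len / threads)
    blocks, threads
end

launch_config(100)    # => (1, 100)    one block of 100 threads
launch_config(1000)   # => (4, 256)    4 blocks of 256 threads cover 1000 elements
launch_config(10^6)   # => (3907, 256)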

src/backends/julia/julia.jl

Lines changed: 2 additions & 3 deletions
@@ -117,9 +117,8 @@ for i = 0:7
     fargs = ntuple(x-> :(broadcast_index(args[$x], sz, i)), i)
     fidxargs = ntuple(x-> :(args[$x]), i)
     @eval begin
-
         function mapidx{F, T, N}(f::F, data::JLArray{T, N}, args::NTuple{$i, Any})
-            for i in eachindex(data)
+            @threads for i in eachindex(data)
                 f(i, data, $(fidxargs...))
             end
         end
@@ -168,7 +167,7 @@ function gpu_call(f, A::JLArray, args, globalsize = length(A), local_size = 0)
         parallel_kernel(0, len, f, unpacked_args)
         return
     end
-    for id = 1:n
+    @threads for id = 1:n
         parallel_kernel((id - 1) * width, width, f, unpacked_args)
     end
     len_floored = width * n
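The Julia backend change replaces plain serial for loops with Base.Threads.@threads in mapidx and gpu_call, so iterations are split across the OS threads Julia was started with. A minimal sketch of that pattern, independent of GPUArrays (the threaded_fill! helper exists only for this example; Julia must be launched with JULIA_NUM_THREADS > 1 to actually run in parallel):

using Base.Threads

# Each iteration writes a distinct index, so there is no data shared between
# threads; @threads splits the index range across the available threads.
function threaded_fill!(out::Vector{Float64}, f)
    @threads for i in eachindex(out)
        out[i] = f(i)
    end
    out
end

threaded_fill!(zeros(8), i -> Float64(i)^2)   # => [1.0, 4.0, 9.0, ..., 64.0]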

src/backends/opencl/opencl.jl

Lines changed: 13 additions & 3 deletions
@@ -88,8 +88,9 @@ function free{T, N}(x::CLArray{T, N})
     nothing
 end
 
-linear_index(::cli.CLArray, state) = get_global_id(0) + Cuint(1)
-
+function linear_index(::cli.CLArray, state)
+    (get_local_size(0)*get_group_id(0) + get_local_id(0)) + Cuint(1)
+end
 
 function cl_readbuffer(q, buf, dev_offset, hostref, nbytes)
     n_evts = UInt(0)
@@ -205,6 +206,14 @@ function (clfunc::CLFunction{T}){T, T2, N}(A::CLArray{T2, N}, args...)
     clfunc(args, length(A))
 end
 
+function thread_blocks_heuristic(len::Integer)
+    threads = min(len, 256)
+    blocks = ceil(Int, len/threads)
+    blocks = blocks * threads
+    blocks, threads
+end
+
+
 function gpu_call{T, N}(f, A::CLArray{T, N}, args, globalsize = length(A), localsize = nothing)
     ctx = GPUArrays.context(A)
     _args = if !isa(f, Tuple{String, Symbol})
@@ -213,7 +222,8 @@ function gpu_call{T, N}(f, A::CLArray{T, N}, args, globalsize = length(A), local
         args
     end
     clfunc = CLFunction(f, _args, ctx.queue)
-    clfunc(_args, globalsize, localsize)
+    blocks, thread = thread_blocks_heuristic(globalsize)
+    clfunc(_args, blocks, thread)
 end
 
 ###################
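For the OpenCL backend, the kernel-side linear index is now rebuilt from the work-group id, work-group size, and local id (with this launch scheme that equals the global id, plus 1 for Julia's 1-based indexing), and gpu_call rounds the global work size up to a whole number of 256-wide work groups via the new thread_blocks_heuristic. A host-side sketch of that rounding, assuming the cl_launch_sizes name purely for illustration:

# Round the global work size up to a multiple of the work-group size, as the
# new thread_blocks_heuristic above does (`blocks = blocks * threads` yields
# the padded global size). Work items beyond `len` then need to be guarded
# against inside the kernel.
function cl_launch_sizes(len::Integer)
    local_size  = min(len, 256)
    global_size = ceil(Int, len / local_size) * local_size
    global_size, local_size
end

cl_launch_sizes(1000)   # => (1024, 256): four work groups, 24 idle work items
cl_launch_sizes(100)    # => (100, 100)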
