
Commit 3a52020

performance fixes
1 parent 7c4e254 commit 3a52020

3 files changed: +17 additions, -28 deletions

src/backends/cudanative/cudanative.jl

Lines changed: 2 additions & 22 deletions
@@ -125,7 +125,7 @@ end
 
 thread_blocks_heuristic{N}(s::NTuple{N, Integer}) = thread_blocks_heuristic(prod(s))
 function thread_blocks_heuristic(len::Integer)
-    threads = min(len, 1024)
+    threads = min(len, 256)
     blocks = ceil(Int, len/threads)
     blocks, threads
 end
@@ -306,25 +306,6 @@ function acc_mapreduce{T, OT, N}(
 end
 
 
-# TODO figure out how interact with CUDArt and CUDAdr
-#GFFT = GPUArray(Complex64, div(size(G,1),2)+1, size(G,2))
-# function Base.fft!(A::CUArray)
-#     G, GFFT = CUFFT.RCpair(A)
-#     fft!(G, GFFT)
-# end
-# function Base.fft!(out::CUArray, A::CUArray)
-#     plan(out, A)(out, A, true)
-# end
-#
-# function Base.ifft!(A::CUArray)
-#     G, GFFT = CUFFT.RCpair(A)
-#     ifft!(G, GFFT)
-# end
-# function Base.ifft!(out::CUArray, A::CUArray)
-#     plan(out, A)(out, A, false)
-# end
-
 ########################################
 # CUBLAS
 
@@ -343,8 +324,7 @@ if is_blas_supported(:CUBLAS)
     # # implement blas interface
     hasblas(::CUContext) = true
    blas_module(::CUContext) = CUBLAS
-    blasbuffer(ctx::CUContext, A) = to_cudart(A)
-
+    blasbuffer(ctx::CUContext, A) = buffer(A)
 end
 
 if is_fft_supported(:CUFFT)
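Context for the CUDAnative change: the launch heuristic now caps the block size at 256 threads instead of 1024 and derives the block count from the flattened length, and the CUBLAS glue passes buffer(A) directly instead of converting through to_cudart. A minimal standalone sketch of the launch arithmetic only (plain Julia, no GPU required; the launch_config name and the example lengths are just for this illustration, not part of the commit):

# Sketch of the launch heuristic after this commit: at most 256 threads per
# block, and enough blocks to cover `len` elements (the last block may be
# only partially used).
function launch_config(len::Integer)
    threads = min(len, 256)
    blocks  = ceil(Int, len / threads)
    blocks, threads
end

launch_config(100)    # => (1, 100)    one block of 100 threads
launch_config(1000)   # => (4, 256)    4 blocks of 256 threads cover 1000 elements
launch_config(10^6)   # => (3907, 256)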

src/backends/julia/julia.jl

Lines changed: 2 additions & 3 deletions
@@ -117,9 +117,8 @@ for i = 0:7
     fargs = ntuple(x-> :(broadcast_index(args[$x], sz, i)), i)
     fidxargs = ntuple(x-> :(args[$x]), i)
     @eval begin
-
         function mapidx{F, T, N}(f::F, data::JLArray{T, N}, args::NTuple{$i, Any})
-            for i in eachindex(data)
+            @threads for i in eachindex(data)
                 f(i, data, $(fidxargs...))
             end
         end
@@ -168,7 +167,7 @@ function gpu_call(f, A::JLArray, args, globalsize = length(A), local_size = 0)
         parallel_kernel(0, len, f, unpacked_args)
         return
     end
-    for id = 1:n
+    @threads for id = 1:n
         parallel_kernel((id - 1) * width, width, f, unpacked_args)
     end
     len_floored = width * n
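The Julia backend change replaces plain serial for loops with Base.Threads.@threads in mapidx and gpu_call, so iterations are split across the OS threads Julia was started with. A minimal sketch of that pattern, independent of GPUArrays (the threaded_fill! helper exists only for this example; Julia must be launched with JULIA_NUM_THREADS > 1 to actually run in parallel):

using Base.Threads

# Each iteration writes a distinct index, so there is no data shared between
# threads; @threads splits the index range across the available threads.
function threaded_fill!(out::Vector{Float64}, f)
    @threads for i in eachindex(out)
        out[i] = f(i)
    end
    out
end

threaded_fill!(zeros(8), i -> Float64(i)^2)   # => [1.0, 4.0, 9.0, ..., 64.0]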

src/backends/opencl/opencl.jl

Lines changed: 13 additions & 3 deletions
@@ -88,8 +88,9 @@ function free{T, N}(x::CLArray{T, N})
     nothing
 end
 
-linear_index(::cli.CLArray, state) = get_global_id(0) + Cuint(1)
-
+function linear_index(::cli.CLArray, state)
+    (get_local_size(0)*get_group_id(0) + get_local_id(0)) + Cuint(1)
+end
 
 function cl_readbuffer(q, buf, dev_offset, hostref, nbytes)
     n_evts = UInt(0)
@@ -205,6 +206,14 @@ function (clfunc::CLFunction{T}){T, T2, N}(A::CLArray{T2, N}, args...)
     clfunc(args, length(A))
 end
 
+function thread_blocks_heuristic(len::Integer)
+    threads = min(len, 256)
+    blocks = ceil(Int, len/threads)
+    blocks = blocks * threads
+    blocks, threads
+end
+
+
 function gpu_call{T, N}(f, A::CLArray{T, N}, args, globalsize = length(A), localsize = nothing)
     ctx = GPUArrays.context(A)
     _args = if !isa(f, Tuple{String, Symbol})
@@ -213,7 +222,8 @@ function gpu_call{T, N}(f, A::CLArray{T, N}, args, globalsize = length(A), local
         args
     end
     clfunc = CLFunction(f, _args, ctx.queue)
-    clfunc(_args, globalsize, localsize)
+    blocks, thread = thread_blocks_heuristic(globalsize)
+    clfunc(_args, blocks, thread)
 end
 
 ###################
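For the OpenCL backend, the kernel-side linear index is now rebuilt from the work-group id, work-group size, and local id (with this launch scheme that equals the global id, plus 1 for Julia's 1-based indexing), and gpu_call rounds the global work size up to a whole number of 256-wide work groups via the new thread_blocks_heuristic. A host-side sketch of that rounding, assuming the cl_launch_sizes name purely for illustration:

# Round the global work size up to a multiple of the work-group size, as the
# new thread_blocks_heuristic above does (`blocks = blocks * threads` yields
# the padded global size). Work items beyond `len` then need to be guarded
# against inside the kernel.
function cl_launch_sizes(len::Integer)
    local_size  = min(len, 256)
    global_size = ceil(Int, len / local_size) * local_size
    global_size, local_size
end

cl_launch_sizes(1000)   # => (1024, 256): four work groups, 24 idle work items
cl_launch_sizes(100)    # => (100, 100)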
