@@ -131,14 +131,16 @@ for i = 0:10
131
131
fargs = ntuple (x-> :(simple_broadcast_index ($ (args[x]), cartesian_global_index... )), i)
132
132
@eval begin
133
133
# http://developer.amd.com/resources/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
134
- function reduce_kernel (state, f, op, v0:: T , A, :: Val{LMEM} , result, $ (args... )) where {T, LMEM}
134
+ function reduce_kernel (state, f, op, v0:: T , A, len, ax, :: Val{LMEM} , result, $ (args... )) where {T, LMEM}
135
135
tmp_local = @LocalMemory (state, T, LMEM)
136
136
global_index = linear_index (state)
137
137
acc = v0
138
138
# # Loop sequentially over chunks of input vector
139
- while global_index <= length (A)
140
- cartesian_global_index = Tuple (CartesianIndices (axes (A))[global_index])
141
- element = f (A[cartesian_global_index... ], $ (fargs... ))
139
+ # HACK: length(A) and axes(A) aren't GPU compatible, so pass them instead
140
+ # https://github.com/JuliaGPU/CUDAnative.jl/issues/367
141
+ while global_index <= len
142
+ cartesian_global_index = Tuple (CartesianIndices (ax)[global_index])
143
+ @inbounds element = f (A[cartesian_global_index... ], $ (fargs... ))
142
144
acc = op (acc, element)
143
145
global_index += global_size (state)
144
146
end
@@ -182,7 +184,7 @@ function acc_mapreduce(f, op, v0::OT, A::GPUSrcArray, rest::Tuple) where {OT}
182
184
end
183
185
out = similar (A, OT, (blocksize,))
184
186
fill! (out, v0)
185
- args = (f, op, v0, A, Val {threads} (), out, rest... )
187
+ args = (f, op, v0, A, length (A), axes (A), Val {threads} (), out, rest... )
186
188
gpu_call (reduce_kernel, out, args, ((blocksize,), (threads,)))
187
189
reduce (op, Array (out))
188
190
end
0 commit comments