@@ -207,11 +207,6 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
207
207
return R
208
208
end
209
209
210
- # allocate an additional, empty dimension to write the reduced value to.
211
- # this does not affect the actual location in memory of the final values,
212
- # but allows us to write a generalized kernel supporting partial reductions.
213
- R′ = reshape (R, (size (R)... , 1 ))
214
-
215
210
# how many threads do we want?
216
211
#
217
212
# threads in a block work together to reduce values across the reduction dimensions;
@@ -231,7 +226,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
231
226
# we might not be able to launch all those threads to reduce each slice in one go.
232
227
# that's why each thread also loops across its inputs, processing multiple values
233
228
# so that we can span the entire reduction dimension using a single thread block.
234
- kernel = @cuda launch= false partial_mapreduce_grid (f, op, init, Rreduce, Rother, Val (shuffle), R′ , A)
229
+ kernel = @cuda launch= false partial_mapreduce_grid (f, op, init, Rreduce, Rother, Val (shuffle), R, A)
235
230
compute_shmem (threads) = shuffle ? 0 : threads* sizeof (T)
236
231
kernel_config = launch_configuration (kernel. fun; shmem= compute_shmem∘ compute_threads)
237
232
reduce_threads = compute_threads (kernel_config. threads)
@@ -258,7 +253,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
258
253
# perform the actual reduction
259
254
if reduce_blocks == 1
260
255
# we can cover the dimensions to reduce using a single block
261
- kernel (f, op, init, Rreduce, Rother, Val (shuffle), R′ , A; threads, blocks, shmem)
256
+ kernel (f, op, init, Rreduce, Rother, Val (shuffle), R, A; threads, blocks, shmem)
262
257
else
263
258
# we need multiple steps to cover all values to reduce
264
259
partial = similar (R, (size (R)... , reduce_blocks))
@@ -271,7 +266,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
271
266
@cuda (threads, blocks, shmem,
272
267
partial_mapreduce_grid (f, op, init, Rreduce, Rother, Val (shuffle), partial, A))
273
268
274
- GPUArrays. mapreducedim! (identity, op, R′ , partial; init= init)
269
+ GPUArrays. mapreducedim! (identity, op, R, partial; init= init)
275
270
end
276
271
277
272
return R
0 commit comments