Skip to content

Commit 64f4a23

Browse files
authored
Remove the unnecessary reshape during mapreduce. (#2778)
1 parent bb8259f commit 64f4a23

File tree

1 file changed

+3
-8
lines changed

1 file changed

+3
-8
lines changed

src/mapreduce.jl

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -207,11 +207,6 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
207207
return R
208208
end
209209

210-
# allocate an additional, empty dimension to write the reduced value to.
211-
# this does not affect the actual location in memory of the final values,
212-
# but allows us to write a generalized kernel supporting partial reductions.
213-
R′ = reshape(R, (size(R)..., 1))
214-
215210
# how many threads do we want?
216211
#
217212
# threads in a block work together to reduce values across the reduction dimensions;
@@ -231,7 +226,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
231226
# we might not be able to launch all those threads to reduce each slice in one go.
232227
# that's why each threads also loops across their inputs, processing multiple values
233228
# so that we can span the entire reduction dimension using a single thread block.
234-
kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R′, A)
229+
kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A)
235230
compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T)
236231
kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads)
237232
reduce_threads = compute_threads(kernel_config.threads)
@@ -258,7 +253,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
258253
# perform the actual reduction
259254
if reduce_blocks == 1
260255
# we can cover the dimensions to reduce using a single block
261-
kernel(f, op, init, Rreduce, Rother, Val(shuffle), R′, A; threads, blocks, shmem)
256+
kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
262257
else
263258
# we need multiple steps to cover all values to reduce
264259
partial = similar(R, (size(R)..., reduce_blocks))
@@ -271,7 +266,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
271266
@cuda(threads, blocks, shmem,
272267
partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A))
273268

274-
GPUArrays.mapreducedim!(identity, op, R′, partial; init=init)
269+
GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
275270
end
276271

277272
return R

0 commit comments

Comments
 (0)