@@ -148,8 +148,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
148148 Base. check_reducedims (R, A)
149149 length (A) == 0 && return R # isempty(::Broadcasted) iterates
150150
151- # be conservative about using shuffle instructions
152- shuffle = T <: Union{Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, UInt8}
151+ # be conservative about using shuffle instructions
152+ shuffle = T <: Union{Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, UInt8}
153153
154154 # add singleton dimensions to the output container, if needed
155155 if ndims (R) < ndims (A)
@@ -184,8 +184,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
184184 Int (dev. maxThreadgroupMemoryLength) ÷ sizeof (T))
185185
186186 # also want to make sure the grain size is not too high as to starve threads of work.
187- # as a simple heuristic, assume we can launch the maximum number of threads.
188- grain = min (grain, prevpow (2 , cld (length (Rreduce), maxthreads)))
187+ # as a simple heuristic, ensure we can launch the maximum number of threads.
188+ grain = min (grain, nextpow (2 , cld (length (Rreduce), maxthreads)))
189189
190190 # how many threads can we launch?
191191 #
0 commit comments