Minor mapreduce improvements (#303)

maleadt · web-flow · commit 9f23773acb77 · 2024-03-05T13:33:10.000+01:00
diff --git a/src/device/intrinsics/simd.jl b/src/device/intrinsics/simd.jl
@@ -86,8 +86,8 @@ Returns `a * b + c`.
 
 ## SIMD Shuffle Up/Down
 
-simd_shuffle_map = ((Float32, "f16"),
-                    (Float16, "f32"),
+simd_shuffle_map = ((Float32, "f32"),
+                    (Float16, "f16"),
                     (Int32,   "s.i32"),
                     (UInt32,  "u.i32"),
                     (Int16,   "s.i16"),
@@ -133,4 +133,4 @@ modify the lower delta lanes of data because it doesn’t wrap values around the
 
 T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8
 """
-simd_shuffle_up
+simd_shuffle_up
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
@@ -148,8 +148,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
     Base.check_reducedims(R, A)
     length(A) == 0 && return R # isempty(::Broadcasted) iterates
 
-     # be conservative about using shuffle instructions
-     shuffle = T <: Union{Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, UInt8}
+    # be conservative about using shuffle instructions
+    shuffle = T <: Union{Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, UInt8}
 
     # add singleton dimensions to the output container, if needed
     if ndims(R) < ndims(A)
@@ -184,8 +184,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
                      Int(dev.maxThreadgroupMemoryLength) ÷ sizeof(T))
 
     # also want to make sure the grain size is not too high as to starve threads of work.
-    # as a simple heuristic, assume we can launch the maximum number of threads.
-    grain = min(grain, prevpow(2, cld(length(Rreduce), maxthreads)))
+    # as a simple heuristic, ensure we can launch the maximum number of threads.
+    grain = min(grain, prevpow(2, cld(length(Rreduce), maxthreads)))
 
     # how many threads can we launch?
     #