@@ -140,8 +140,8 @@ function partial_mapreduce_device(f, op, neutral, maxthreads, ::Val{Rreduce},
     return
 end
 
-function big_mapreduce_kernel(f, op, neutral, ::Val{Rreduce}, ::Val{Rother}, R, As) where {Rreduce, Rother}
-    grid_idx = thread_position_in_threadgroup_1d() + (threadgroup_position_in_grid_1d() - 1u32) * threadgroups_per_grid_1d()
+function serial_mapreduce_kernel(f, op, neutral, ::Val{Rreduce}, ::Val{Rother}, R, As) where {Rreduce, Rother}
+    grid_idx = thread_position_in_grid_1d()
 
     @inbounds if grid_idx <= length(Rother)
         Iother = Rother[grid_idx]
@@ -166,7 +166,7 @@
 
 ## COV_EXCL_STOP
 
-_big_mapreduce_threshold(dev) = dev.maxThreadsPerThreadgroup.width * num_gpu_cores()
+serial_mapreduce_threshold(dev) = dev.maxThreadsPerThreadgroup.width * num_gpu_cores()
 
 function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
                                  A::Union{AbstractArray,Broadcast.Broadcasted};
@@ -194,10 +194,11 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
     @assert length(Rother) > 0
 
     # If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
-    if length(Rother) >= _big_mapreduce_threshold(device(R))
-        threads = min(length(Rreduce), 512)
+    if length(Rother) >= serial_mapreduce_threshold(device(R))
+        kernel = @metal launch=false serial_mapreduce_kernel(f, op, init, Val(Rreduce), Val(Rother), R, A)
+        threads = min(length(Rother), kernel.pipeline.maxTotalThreadsPerThreadgroup)
         groups = cld(length(Rother), threads)
-        kernel = @metal threads groups big_mapreduce_kernel(f, op, init, Val(Rreduce), Val(Rother), R, A)
+        kernel(f, op, init, Val(Rreduce), Val(Rother), R, A; threads, groups)
         return R
     end
 
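A minimal sketch of the launch pattern adopted above, assuming a hypothetical `scale_kernel`: compile with `@metal launch=false`, size the threadgroup from the pipeline's `maxTotalThreadsPerThreadgroup`, then launch the compiled kernel with explicit `threads`/`groups` keywords.

using Metal

# Hypothetical kernel, used only to illustrate the launch pattern.
function scale_kernel(a, factor)
    i = thread_position_in_grid_1d()
    if i <= length(a)
        @inbounds a[i] *= factor
    end
    return
end

a = MtlArray(rand(Float32, 10_000))

# Compile without launching, then derive the launch configuration
# from the pipeline's occupancy limit, as in the diff above.
kernel = @metal launch=false scale_kernel(a, 2f0)
threads = min(length(a), kernel.pipeline.maxTotalThreadsPerThreadgroup)
groups = cld(length(a), threads)
kernel(a, 2f0; threads, groups)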