Commit 2c1505e

Avoid redundant reduction step. (#4774)
A redundant reduction step over the partial accumulators could be executed, causing inefficiency and unguarded shared memory access.

Signed-off-by: Ilya Enkovich <[email protected]>
1 parent 83367a9 commit 2c1505e

File tree

2 files changed, +23 -2 lines changed

test/TritonIntelGPU/tritongpu_reduce_op_lowering.mlir

Lines changed: 22 additions & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
+// RUN: triton-opt %s --split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
 
 // COM: Tests reduction when threads_per_warp < num_warps.
 
@@ -35,3 +35,24 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 64 : i32, "ttg.th
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [32, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 32 : i32} {
+  tt.func public @test_reduce(%arg0: tensor<32x128xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 0, parent = #blocked}>> {
+    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
+    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
+    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
+    // CHECK: llvm.call spir_funccc @_Z27__spirv_GroupNonUniformFAddiif
+    // CHECK: llvm.store
+    // CHECK-NOT: llvm.load
+    // CHECK: llvm.call spir_funccc @_Z7barrierj
+    %1 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
+    ^bb0(%arg2: f32, %arg3: f32):
+      %2 = arith.addf %arg2, %arg3 : f32
+      tt.reduce.return %2 : f32
+    }) {allocation.offset = 0 : i32} : (tensor<32x128xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    tt.return %1 : tensor<128xf32, #ttg.slice<{dim = 0, parent = #blocked}>>
+  }
+}

third_party/intel/lib/TritonIntelGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -292,7 +292,7 @@ struct ReduceOpConversion
     // size is [elems / sizeInterWarps, N] -> [elems / sizeInterWarps, ceil(N,
     // numLanes)] in each reduce iteration.
     unsigned problemBatchSize = elems / sizeInterWarps;
-    for (unsigned problemSize = sizeInterWarps; problemSize > 0;
+    for (unsigned problemSize = sizeInterWarps; problemSize > 1;
          problemSize = problemSize / numLanes) {
       unsigned reduceLaneNumber = std::min(problemSize, numLanes);
       unsigned totalProblemSizePerIter = problemSize * problemBatchSize;
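
To make the effect of the tightened loop bound concrete: in the configuration exercised by the new test (32 warps of 32 lanes, so sizeInterWarps = 32 and numLanes = 32), the old condition problemSize > 0 admits a second pass in which a single partial accumulator is "reduced" on its own, which is the redundant step the commit message blames for the inefficiency and the unguarded shared memory access. The following is a minimal standalone C++ sketch, not part of the Triton sources, that replays only the loop bounds; the variable names mirror the hunk above, and the 32/32 values are taken from the new test case.

#include <algorithm>
#include <cstdio>

// Replay only the loop bounds from the hunk above (no GPU work). `stopAt`
// stands in for the old (0) or new (1) exit condition.
static void replayLoop(unsigned sizeInterWarps, unsigned numLanes,
                       unsigned stopAt) {
  for (unsigned problemSize = sizeInterWarps; problemSize > stopAt;
       problemSize = problemSize / numLanes) {
    unsigned reduceLaneNumber = std::min(problemSize, numLanes);
    std::printf("  problemSize = %u, reduceLaneNumber = %u\n", problemSize,
                reduceLaneNumber);
  }
}

int main() {
  std::printf("old condition (problemSize > 0):\n");
  replayLoop(32, 32, 0); // two passes: 32 lanes, then a pointless pass over 1
  std::printf("new condition (problemSize > 1):\n");
  replayLoop(32, 32, 1); // a single pass over 32 lanes; the extra pass is gone
  return 0;
}

This matches the FileCheck expectations in the new test: four @_Z27__spirv_GroupNonUniformFAddiif calls and one llvm.store followed by the barrier, with no intervening llvm.load from the eliminated second pass.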
