Commit aeac283

[TritonGPU] Enable accum-init optimization for unconditionally zero-ed accumulators (#6395)
Currently, the pass doesn't fire when [there is no explicit op that conditionally clears the accumulator](https://github.com/triton-lang/triton/blob/main/lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp#L207-L211). It thus misses the simplest case where the optimization is applicable: the accumulator is initialized to zero, and after the first iteration it is always updated with `+=`. The motivation is IR like the following. We want to hoist the `tmem_alloc` outside of the tile loop, but that would require explicitly clearing the accumulator after the K loop for one tile completes. Enabling the optimization for this case lets us skip the explicit clearing.

```
for tile ...
  for k ... iter_args(arg9 = cst_zero)
    acc = tmem_alloc arg9
    mma A B acc
    next_acc = tmem_load acc
    ...
    yield next_acc
```

---------

Co-authored-by: Masahiro Masuda <[email protected]>
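For context, a sketch of the transformation this enables in the simplest case, modeled on the updated `@non_cond_override` lit test; layout attributes and surrounding constants are simplified, so treat it as illustrative rather than verbatim pass output:

```mlir
// Before: the accumulator iter_arg starts at zero and is accumulated into
// unconditionally on every iteration.
%cst = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1>
%res = scf.for %k = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%acc = %cst) -> (tensor<128x16xf32, #mma1>) : i32 {
  %next = ttng.warp_group_dot %A, %B, %acc : !ttg.memdesc<128x64xf16, #shared, #smem> * !ttg.memdesc<64x16xf16, #shared1, #smem> -> tensor<128x16xf32, #mma1>
  scf.yield %next : tensor<128x16xf32, #mma1>
}

// After: an i1 flag (false on the first iteration, true afterwards) is
// threaded through the loop and passed as the dot's use-accumulator operand,
// so the zero init never needs to be materialized (or, for MMAv5, explicitly
// cleared in TMEM).
%res:2 = scf.for %k = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%acc = %cst, %flag = %false) -> (tensor<128x16xf32, #mma1>, i1) : i32 {
  %next = ttng.warp_group_dot %A, %B, %acc, %flag : !ttg.memdesc<128x64xf16, #shared, #smem> * !ttg.memdesc<64x16xf16, #shared1, #smem> -> tensor<128x16xf32, #mma1>
  scf.yield %next, %true : tensor<128x16xf32, #mma1>, i1
}
```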

5 files changed (+136, -78)

lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp

Lines changed: 91 additions & 46 deletions
```diff
@@ -1,5 +1,6 @@
 #include "mlir/Transforms/Passes.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
+#include "triton/Dialect/Triton/IR/OpInterfaces.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
@@ -113,6 +114,19 @@ void setUseAccFlag(Operation *op, Value useAcc) {
   }
 }
 
+Value getUseAccFlag(Operation *op) {
+  assert(isa<DotOpInterface>(op) && "Expected a dot-like operation");
+  if (auto wgDotOp = dyn_cast<triton::nvidia_gpu::WarpGroupDotOp>(op)) {
+    return wgDotOp.getUseC();
+  } else if (auto tc05MmaOp =
+                 dyn_cast<triton::nvidia_gpu::MMAv5OpInterface>(op)) {
+    return tc05MmaOp.useAccumulator();
+  } else {
+    assert(false && "Unexpected dot-like operation");
+  }
+  return nullptr;
+}
+
 bool isConstantZeroTensor(Value v) {
   return (matchPattern(v, m_Zero()) || matchPattern(v, m_AnyZeroFloat()));
 }
@@ -157,6 +171,18 @@ findZeroInitOp(Value accUse, scf::ForOp forOp, bool &loopArgIsZero) {
   return std::nullopt;
 }
 
+std::optional<bool> getBoolFromConstant(Value cst) {
+  auto constantOp = cst.getDefiningOp<arith::ConstantOp>();
+  if (!constantOp) {
+    return std::nullopt;
+  }
+  assert(constantOp.getValue());
+  if (auto boolAttr = dyn_cast<BoolAttr>(constantOp.getValue())) {
+    return boolAttr.getValue();
+  }
+  return std::nullopt;
+}
+
 } // namespace
 
 class OptimizeAccumulatorInitPass
@@ -206,62 +232,81 @@ class OptimizeAccumulatorInitPass
       bool loopArgIsZero = false;
       std::optional<std::pair<Operation *, int>> zeroInitOp =
          findZeroInitOp(accUse, forOp, loopArgIsZero);
-      if (!zeroInitOp) {
+
+      if (!zeroInitOp && !loopArgIsZero) {
        continue;
      }
 
+      if (auto useAccValue = getUseAccFlag(mmaOp)) {
+        auto useAcc = getBoolFromConstant(useAccValue);
+        if (!useAcc || *useAcc == false) {
+          // Do not run this optimization if there is already a non-constant
+          // flag (this pass has already run), or if this MMA does not use the
+          // accumulator (e.g. the peeled MMA in the prologue, the first dot
+          // in attention)
+          continue;
+        }
+      }
+
       Value loopArgFlagValue = loopArgIsZero ? vFalse : vTrue;
       (void)addIterArgsToLoop(rewriter, forOp, {loopArgFlagValue});
       loopArgFlagValue =
          forOp.getRegionIterArg(forOp.getNumRegionIterArgs() - 1);
 
-      Value condition = nullptr;
-      Value oldValue = nullptr;
-      Value zeroValue = nullptr;
-      bool thenInitsToZero = false;
-      if (auto selOp = dyn_cast<arith::SelectOp>(zeroInitOp->first)) {
-        condition = selOp.getCondition();
-        oldValue = isConstantZeroTensor(selOp.getTrueValue())
-                       ? selOp.getFalseValue()
-                       : selOp.getTrueValue();
-        zeroValue = isConstantZeroTensor(selOp.getTrueValue())
-                        ? selOp.getTrueValue()
-                        : selOp.getFalseValue();
-        thenInitsToZero = isConstantZeroTensor(selOp.getTrueValue());
-      } else {
-        assert(isa<scf::IfOp>(*zeroInitOp->first) && "Expected an if op");
-        auto ifOp = cast<scf::IfOp>(zeroInitOp->first);
-        unsigned resultIndex = zeroInitOp->second;
-        condition = ifOp.getCondition();
-        Value thenVal = ifOp.thenYield()->getOperand(resultIndex);
-        Value elseVal = ifOp.elseYield()->getOperand(resultIndex);
-        oldValue = isConstantZeroTensor(thenVal) ? elseVal : thenVal;
-        zeroValue = isConstantZeroTensor(thenVal) ? thenVal : elseVal;
-        thenInitsToZero = isConstantZeroTensor(thenVal);
-      }
+      if (zeroInitOp) {
+        Value condition = nullptr;
+        Value oldValue = nullptr;
+        Value zeroValue = nullptr;
+        bool thenInitsToZero = false;
+        if (auto selOp = dyn_cast<arith::SelectOp>(zeroInitOp->first)) {
+          condition = selOp.getCondition();
+          oldValue = isConstantZeroTensor(selOp.getTrueValue())
+                         ? selOp.getFalseValue()
+                         : selOp.getTrueValue();
+          zeroValue = isConstantZeroTensor(selOp.getTrueValue())
+                          ? selOp.getTrueValue()
+                          : selOp.getFalseValue();
+          thenInitsToZero = isConstantZeroTensor(selOp.getTrueValue());
+        } else {
+          assert(isa<scf::IfOp>(*zeroInitOp->first) && "Expected an if op");
+          auto ifOp = cast<scf::IfOp>(zeroInitOp->first);
+          unsigned resultIndex = zeroInitOp->second;
+          condition = ifOp.getCondition();
+          Value thenVal = ifOp.thenYield()->getOperand(resultIndex);
+          Value elseVal = ifOp.elseYield()->getOperand(resultIndex);
+          oldValue = isConstantZeroTensor(thenVal) ? elseVal : thenVal;
+          zeroValue = isConstantZeroTensor(thenVal) ? thenVal : elseVal;
+          thenInitsToZero = isConstantZeroTensor(thenVal);
+        }
 
-      // Create a select op that updates the flag
-      rewriter.setInsertionPoint(zeroInitOp->first);
-      bool zeroingBeforeMMA = zeroInitOp->first->isBeforeInBlock(mmaOp);
-      Value prevFlagValue = zeroingBeforeMMA ? loopArgFlagValue : vTrue;
-      auto selectFlagOp = rewriter.create<arith::SelectOp>(
-          loc, condition, thenInitsToZero ? vFalse : prevFlagValue,
-          thenInitsToZero ? prevFlagValue : vFalse);
-      setUseAccFlag(mmaOp, zeroingBeforeMMA ? selectFlagOp : loopArgFlagValue);
-      auto forYield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
-      forYield->insertOperands(forYield->getNumOperands(),
-                               {zeroingBeforeMMA ? vTrue : selectFlagOp});
+        // Create a select op that updates the flag
+        rewriter.setInsertionPoint(zeroInitOp->first);
+        bool zeroingBeforeMMA = zeroInitOp->first->isBeforeInBlock(mmaOp);
+        Value prevFlagValue = zeroingBeforeMMA ? loopArgFlagValue : vTrue;
+        auto selectFlagOp = rewriter.create<arith::SelectOp>(
+            loc, condition, thenInitsToZero ? vFalse : prevFlagValue,
+            thenInitsToZero ? prevFlagValue : vFalse);
+        setUseAccFlag(mmaOp,
+                      zeroingBeforeMMA ? selectFlagOp : loopArgFlagValue);
+        auto forYield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
+        forYield->insertOperands(forYield->getNumOperands(),
+                                 {zeroingBeforeMMA ? vTrue : selectFlagOp});
 
-      // Stop clearing out the accumulator with zero
-      if (auto selOp = dyn_cast<arith::SelectOp>(zeroInitOp->first)) {
-        rewriter.setInsertionPoint(selOp);
-        rewriter.replaceOp(selOp, oldValue);
-      } else {
-        auto ifOp = cast<scf::IfOp>(zeroInitOp->first);
-        int resultIndex = zeroInitOp->second;
-        auto zeroingYield =
-            thenInitsToZero ? ifOp.thenYield() : ifOp.elseYield();
-        zeroingYield.setOperand(resultIndex, oldValue);
+        // Stop clearing out the accumulator with zero
+        if (auto selOp = dyn_cast<arith::SelectOp>(zeroInitOp->first)) {
+          rewriter.setInsertionPoint(selOp);
+          rewriter.replaceOp(selOp, oldValue);
+        } else {
+          auto ifOp = cast<scf::IfOp>(zeroInitOp->first);
+          int resultIndex = zeroInitOp->second;
+          auto zeroingYield =
+              thenInitsToZero ? ifOp.thenYield() : ifOp.elseYield();
+          zeroingYield.setOperand(resultIndex, oldValue);
+        }
+      } else if (loopArgIsZero) {
+        setUseAccFlag(mmaOp, loopArgFlagValue);
+        auto forYield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
+        forYield->insertOperands(forYield->getNumOperands(), vTrue);
       }
     }
 
```
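The new `getUseAccFlag`/`getBoolFromConstant` guard inspects the dot's existing use-accumulator operand before rewriting. A sketch of the two shapes it now bails out on (operand types elided; `%flag` stands for a loop-carried i1 produced by a previous run of this pass):

```mlir
// Bail out: the flag is already non-constant, i.e. the pass has run before.
%acc1 = ttng.warp_group_dot %A, %B, %acc, %flag : ...

// Bail out: a constant-false flag marks an MMA that intentionally ignores
// the accumulator (e.g. the peeled MMA in the prologue, or the first dot
// in attention).
%false = arith.constant false
%acc2 = ttng.warp_group_dot %A, %B, %acc, %false : ...
```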

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp

Lines changed: 10 additions & 4 deletions
```diff
@@ -25,12 +25,16 @@ using Partition = WarpSchedule::Partition;
 // partition or the provided `partition`.
 static void eraseOtherPartitions(scf::ForOp &loop, const WarpSchedule &schedule,
                                  const Partition *partition) {
+  auto inPartition = [&](Operation *op) {
+    const Partition *opPartition =
+        schedule.getPartition(loop.getBody()->findAncestorOpInBlock(*op));
+    return llvm::is_contained({partition, schedule.getRootPartition()},
+                              opPartition);
+  };
   llvm::BitVector toErase(loop.getNumRegionIterArgs(), true);
   for (Operation &op :
        llvm::make_early_inc_range(loop.getBody()->without_terminator())) {
-    const Partition *opPartition = schedule.getPartition(&op);
-    if (!llvm::is_contained({partition, schedule.getRootPartition()},
-                            opPartition)) {
+    if (!inPartition(&op)) {
       op.dropAllUses();
       op.erase();
       continue;
@@ -43,7 +47,9 @@ static void eraseOtherPartitions(scf::ForOp &loop, const WarpSchedule &schedule,
     }
   }
   for (auto [i, arg] : llvm::enumerate(loop.getRegionIterArgs())) {
-    if (toErase.test(i))
+    if (llvm::any_of(arg.getUsers(), inPartition))
+      toErase.reset(i);
+    else if (toErase.test(i))
       arg.dropAllUses();
   }
   eraseLoopCarriedValues(loop, std::move(toErase));
```
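The `findAncestorOpInBlock` call matters because a user of a region iter arg may sit below a nested region op; the partition query must be made on its top-level ancestor in the loop body. A minimal sketch of the shape this handles, with an illustrative generic op and the partition annotations omitted:

```mlir
scf.for %i = %lb to %ub step %c1 iter_args(%arg = %init) -> (tensor<128xf32>) {
  %r = scf.if %cond -> (tensor<128xf32>) {
    // The only user of %arg is nested inside the scf.if; the schedule is
    // queried for the top-level scf.if, not for this nested op.
    %u = "compute.use"(%arg) : (tensor<128xf32>) -> tensor<128xf32>
    scf.yield %u : tensor<128xf32>
  } else {
    scf.yield %arg : tensor<128xf32>
  }
  scf.yield %r : tensor<128xf32>
}
```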

python/test/unit/language/test_matmul.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -469,7 +469,7 @@ def block_scale_mxfp_matmul(  #
 @pytest.mark.parametrize("NUM_STAGES", [1, 2, 4])
 @pytest.mark.parametrize("USE_2D_SCALE_LOAD", [False, True])
 @pytest.mark.skipif(torch.cuda.get_device_capability()[0] < 10, reason="Requires compute capability >= 10")
-def test_blocked_scale_mxfp(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, USE_2D_SCALE_LOAD, device, monkeypatch):
+def test_blocked_scale_mxfp(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, USE_2D_SCALE_LOAD, device):
     if BLOCK_N == 256 and BLOCK_K == 256:
         NUM_STAGES = min(NUM_STAGES, 2)
     elif BLOCK_K == 256:
```

test/TritonGPU/accumulator-init.mlir

Lines changed: 29 additions & 23 deletions
```diff
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -tritongpu-optimize-accumulator-init | FileCheck %s
+// RUN: triton-opt %s -split-input-file -tritongpu-optimize-accumulator-init | FileCheck %s
 
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
@@ -292,42 +292,27 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     tt.return %17 : tensor<128x16xf32, #mma1>
   }
 
-  // Check that we bail out in unsupported cases
-
-  // CHECK-LABEL: @non_zero_init
-  // CHECK-NOT: %[[ACC_NEXT:.+]] = ttng.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !ttg.memdesc
-  tt.func @non_zero_init(%A: !ttg.memdesc<128x64xf16, #shared, #smem>, %B: !ttg.memdesc<64x16xf16, #shared1, #smem>, %arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> {
-    %c0_i32 = arith.constant 0 : i32
-    %cst_2 = arith.constant dense<1.000000e+00> : tensor<128x16xf32, #mma1>
-    %c1_i32 = arith.constant 1 : i32
-    %c8_i32 = arith.constant 8 : i32
-    %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 {
-      %cnd = arith.cmpi slt, %arg3, %ext : i32
-      %acc = ttng.warp_group_dot %A, %B, %arg4 : !ttg.memdesc<128x64xf16, #shared, #smem> * !ttg.memdesc<64x16xf16, #shared1, #smem> -> tensor<128x16xf32, #mma1>
-      %acc_ = arith.select %cnd, %cst_2, %acc : tensor<128x16xf32, #mma1>
-      scf.yield %acc_: tensor<128x16xf32, #mma1>
-    }
-    tt.return %17 : tensor<128x16xf32, #mma1>
-  }
-
-  // CHECK-LABEL: @zero_init_dist_2
-  // CHECK-NOT: %[[ACC_NEXT:.+]] = ttng.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !ttg.memdesc
+  // CHECK-LABEL: @zero_init_dist_2
   tt.func @zero_init_dist_2(%A: !ttg.memdesc<128x64xf16, #shared, #smem>, %B: !ttg.memdesc<64x16xf16, #shared1, #smem>, %arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> {
     %c0_i32 = arith.constant 0 : i32
+    // CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
     %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1>
     %c1_i32 = arith.constant 1 : i32
     %c8_i32 = arith.constant 8 : i32
+    // CHECK: scf.for {{.*}} = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg{{[1-9]+}} = %{{.*}}, %[[ACC:.*]] = %[[CST]], %[[INIT_FLAG:.*]] = %false)
     %17:2 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %cst_2) -> (tensor<128x16xf32, #mma1>, tensor<128x16xf32, #mma1>) : i32 {
       %cnd = arith.cmpi slt, %arg3, %ext : i32
+      // CHECK: %2 = ttng.warp_group_dot {{.*}}, {{.*}}, %[[ACC]], %[[INIT_FLAG]]
       %acc = ttng.warp_group_dot %A, %B, %arg5 : !ttg.memdesc<128x64xf16, #shared, #smem> * !ttg.memdesc<64x16xf16, #shared1, #smem> -> tensor<128x16xf32, #mma1>
       %acc_ = arith.select %cnd, %cst_2, %acc : tensor<128x16xf32, #mma1>
+      // CHECK: scf.yield {{.*}}, {{.*}}, %true
       scf.yield %acc_, %arg4: tensor<128x16xf32, #mma1>, tensor<128x16xf32, #mma1>
     }
     tt.return %17 : tensor<128x16xf32, #mma1>
   }
 
   // CHECK-LABEL: @if_defines_alternative
-  // CHECK-NOT: %[[ACC_NEXT:.+]] = ttng.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !ttg.memdesc
+  // CHECK: %[[ACC_NEXT:.+]] = ttng.warp_group_dot {{.*}}, {{.*}}, {{.*}}, %arg{{.*}} : !ttg.memdesc
   tt.func @if_defines_alternative(%A: !ttg.memdesc<128x64xf16, #shared, #smem>, %B: !ttg.memdesc<64x16xf16, #shared1, #smem>, %arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> {
     %c0_i32 = arith.constant 0 : i32
     %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1>
@@ -343,13 +328,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
       %acc_alt = arith.addf %acc, %cst_3 : tensor<128x16xf32, #mma1>
       scf.yield %acc_alt : tensor<128x16xf32, #mma1>
     }
+    // CHECK: scf.yield {{.*}}, %true
     scf.yield %acc_: tensor<128x16xf32, #mma1>
   }
   tt.return %17 : tensor<128x16xf32, #mma1>
 }
 
 // CHECK-LABEL: @non_cond_override
-// CHECK-NOT: %[[ACC_NEXT:.+]] = ttng.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !ttg.memdesc
+// CHECK: %[[ACC_NEXT:.+]] = ttng.warp_group_dot {{.*}}, {{.*}}, {{.*}}, %arg{{.*}} : !ttg.memdesc
 tt.func @non_cond_override(%A: !ttg.memdesc<128x64xf16, #shared, #smem>, %B: !ttg.memdesc<64x16xf16, #shared1, #smem>, %arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> {
   %c0_i32 = arith.constant 0 : i32
   %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1>
@@ -359,6 +345,26 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 {
     %acc = ttng.warp_group_dot %A, %B, %arg4 : !ttg.memdesc<128x64xf16, #shared, #smem> * !ttg.memdesc<64x16xf16, #shared1, #smem> -> tensor<128x16xf32, #mma1>
     %acc_ = arith.addf %acc, %cst_3 : tensor<128x16xf32, #mma1>
+    // CHECK: scf.yield {{.*}}, %true
+    scf.yield %acc_: tensor<128x16xf32, #mma1>
+  }
+  tt.return %17 : tensor<128x16xf32, #mma1>
+}
+
+
+// Check that we bail out in unsupported cases
+
+// CHECK-LABEL: @non_zero_init
+// CHECK-NOT: %[[ACC_NEXT:.+]] = ttng.warp_group_dot {{.*}}, {{.*}}, {{.*}}, {{.*}} : !ttg.memdesc
+tt.func @non_zero_init(%A: !ttg.memdesc<128x64xf16, #shared, #smem>, %B: !ttg.memdesc<64x16xf16, #shared1, #smem>, %arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> {
+  %c0_i32 = arith.constant 0 : i32
+  %cst_2 = arith.constant dense<1.000000e+00> : tensor<128x16xf32, #mma1>
+  %c1_i32 = arith.constant 1 : i32
+  %c8_i32 = arith.constant 8 : i32
+  %17 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2) -> (tensor<128x16xf32, #mma1>) : i32 {
+    %cnd = arith.cmpi slt, %arg3, %ext : i32
+    %acc = ttng.warp_group_dot %A, %B, %arg4 : !ttg.memdesc<128x64xf16, #shared, #smem> * !ttg.memdesc<64x16xf16, #shared1, #smem> -> tensor<128x16xf32, #mma1>
+    %acc_ = arith.select %cnd, %cst_2, %acc : tensor<128x16xf32, #mma1>
     scf.yield %acc_: tensor<128x16xf32, #mma1>
   }
   tt.return %17 : tensor<128x16xf32, #mma1>
```
