Skip to content

Commit e4e6687

Browse files
[BW][PIPELINE] Add an option to tl.range to disallow accumulator multi-buffering (triton-lang#5858)
Rework mmav5 pipelining to allow pipelining of mma when multibuffering of the accumulator is impossible by putting uses in the same stage as the mma and blocking on wait until the current mma finishes. Based on this support, introducing a new flag to `tl.range` that controls whether multibuffering of the accumulator of the dots in the loop is allowed. Without the mentioned rework of mmav5 pipelining we would simply not pipeline cases where multibuffering is disallowed.
1 parent de0f754 commit e4e6687

File tree

7 files changed

+291
-15
lines changed

7 files changed

+291
-15
lines changed

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ namespace mlir {
1010
namespace triton {
1111

1212
static const char *kNumStagesAttrName = "tt.num_stages";
13+
static const char *kDisallowAccMultiBufferAttrName =
14+
"tt.disallow_acc_multi_buffer";
1315
static const char *kLoopStageAttrName = "loop.stage";
1416
static const char *kLoopClusterAttrName = "loop.cluster";
1517

@@ -37,6 +39,10 @@ void addOps(scf::ForOp forOp, int stage,
3739
void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
3840
Value val);
3941

42+
// Return true if the given ForOp has the attribute
43+
// `tt.disallow_acc_multi_buffer` set to true.
44+
bool getDisallowAccMultiBuffer(scf::ForOp forOp);
45+
4046
// Return the minClusterId and maxClusterId for the given ForOp.
4147
std::pair<int, int> getMinMaxCluster(scf::ForOp &forOp);
4248
std::pair<int, int> getStageCluster(Operation *op);

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,12 @@ void mlir::triton::replaceUsesAndPropagateType(OpBuilder &builder,
234234
op->erase();
235235
}
236236

237+
// Return true if the given ForOp has the attribute
238+
// `tt.disallow_acc_multi_buffer` set to true.
239+
bool mlir::triton::getDisallowAccMultiBuffer(scf::ForOp forOp) {
240+
return forOp->hasAttr(mlir::triton::kDisallowAccMultiBufferAttrName);
241+
}
242+
237243
std::optional<std::pair<int, int>>
238244
mlir::triton::maybeGetStageCluster(Operation *op) {
239245
auto stage =

lib/Dialect/TritonGPU/Transforms/Pipeliner/TC05MMAPipeline.cpp

Lines changed: 67 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,17 @@ void annotateWithPipelineStage(IRRewriter &builder, Operation *op, int stage) {
3535
IntegerAttr::get(builder.getI32Type(), stage));
3636
}
3737

38+
int getPipelineStage(Operation *op) {
39+
return op->getAttrOfType<IntegerAttr>(kPipelineStageAttrName).getInt();
40+
}
41+
3842
struct MMAInfo {
3943
struct AccOverridePoint {
4044
Operation *op;
4145
Value condition = nullptr;
4246
Value initValue = nullptr;
4347
int distance = 0;
48+
bool isFlag = false;
4449
};
4550

4651
ttng::TMEMAllocOp accAlloc; // Directly precedes the dot, allocating tmem
@@ -136,6 +141,7 @@ std::optional<MMAInfo::AccOverridePoint>
136141
getAccOverridePointInLoop(scf::ForOp forOp, ttng::TMEMAllocOp accUse,
137142
ttng::TMEMLoadOp accDef) {
138143
MMAInfo::AccOverridePoint accOverridePoint;
144+
accOverridePoint.isFlag = false;
139145
DenseSet<Value> seen;
140146
Value v = accUse.getSrc();
141147
if (v == nullptr) {
@@ -219,6 +225,7 @@ getAccUseFlagFalseInLoop(scf::ForOp forOp, Value useAccFlagUse) {
219225

220226
IRRewriter builder(v.getDefiningOp()->getNextNode());
221227
MMAInfo::AccOverridePoint accOverridePoint;
228+
accOverridePoint.isFlag = true;
222229
accOverridePoint.distance = dist;
223230
Location loc = v.getDefiningOp()->getLoc();
224231
auto vTrue =
@@ -374,9 +381,12 @@ void updateAccUsesInLoop(IRRewriter &builder, scf::ForOp forOp, MMAInfo &info,
374381
}
375382
auto load = builder.create<ttng::TMEMLoadOp>(
376383
domOp->getLoc(), info.accLoad.getType(), extractSlice);
384+
// If accumulator is multi-buffered, it is implicit that we put the load
385+
// in the last stage.
386+
int pipelineStage = info.accIsMultiBuffered ? numStages - 1 : 0;
377387
annotateWithPipelineStage(
378388
builder, forOp.getBody()->findAncestorOpInBlock(*load.getOperation()),
379-
numStages - 1);
389+
pipelineStage);
380390
for (auto user : directUses) {
381391
user->replaceUsesOfWith(info.accLoad, load);
382392
}
@@ -574,12 +584,45 @@ void createBarrierAndWaitOps(IRRewriter &builder, scf::ForOp forOp,
574584
info.barrierIdx = newBarrierIdx;
575585
annotateWithPipelineStage(builder, info.barrierIdx.getDefiningOp(), 0);
576586

587+
Value originalPhase = info.phase;
577588
Value newPhase = builder.create<arith::SelectOp>(
578589
loc, info.phase.getType(), barWrap,
579590
builder.create<arith::XOrIOp>(loc, info.phase, one), info.phase);
580591
replaceAllUsesDominatedBy(newPhase.getDefiningOp(), newPhase, info.phase);
581592
info.phase = newPhase;
582593
annotateWithPipelineStage(builder, info.phase.getDefiningOp(), 0);
594+
595+
// We need to add a barrier before the load from the accumulator, if it is in the
596+
// same stage as the dot.
597+
ttng::TMEMLoadOp tmemLoad = nullptr;
598+
SmallVector<Operation *> users = {info.accAlloc->getUsers().begin(),
599+
info.accAlloc->getUsers().end()};
600+
while (!users.empty()) {
601+
auto user = users.pop_back_val();
602+
if (isa<ttg::MemDescSubviewOp>(user)) {
603+
users.append(user->getUsers().begin(), user->getUsers().end());
604+
}
605+
if (isa<ttng::TMEMLoadOp>(user) && forOp->isAncestor(user)) {
606+
if (tmemLoad) {
607+
assert(tmemLoad == cast<ttng::TMEMLoadOp>(user) &&
608+
"Should have only one tmem load from the accumulator");
609+
}
610+
tmemLoad = cast<ttng::TMEMLoadOp>(user);
611+
}
612+
}
613+
if (tmemLoad) {
614+
int loadStage =
615+
getPipelineStage(forOp.getBody()->findAncestorOpInBlock(*tmemLoad));
616+
int mmaOpStage = getPipelineStage(mmaOp);
617+
if (loadStage == mmaOpStage) {
618+
builder.setInsertionPoint(tmemLoad);
619+
auto barrier =
620+
builder.create<ttng::WaitBarrierOp>(loc, barrierSlice, originalPhase);
621+
annotateWithPipelineStage(
622+
builder, forOp.getBody()->findAncestorOpInBlock(*barrier),
623+
mmaOpStage);
624+
}
625+
}
583626
}
584627

585628
bool isSafeToPipeline(ttng::TCGen5MMAScaledOp scaledDot) {
@@ -684,17 +727,33 @@ FailureOr<scf::ForOp> preProcessLoopForTC05MMAPipelining(scf::ForOp forOp,
684727
continue;
685728
}
686729

730+
SmallVector<Operation *> accUses = getDirectAccUses(accLoad);
731+
DominanceInfo domOpInfo(forOp);
732+
Operation *newAccLoadInsertPoint =
733+
findNearestCommonDominator(accUses, domOpInfo);
687734
// Check pipelining and multi-buffering constraints
688-
// 1. If the acc is used by an op in the loop (other than the dot) it
689-
// requires multi-buffering to pipeline, as different stages cannot operate
690-
// on the same buffer.
691-
bool requiresMultiBuffer = !getDirectAccUses(accLoad).empty();
735+
// 1. Really needs multibuffering - if the acc is used unconditionally in
736+
// the loop, or under different conditions. If we cannot multibuffer in this
737+
// case, we may as well not pipeline at all, as we will have to wait after
738+
// the dot in every loop iteration.
739+
scf::IfOp topLevelIf =
740+
newAccLoadInsertPoint
741+
? dyn_cast<scf::IfOp>(forOp.getBody()->findAncestorOpInBlock(
742+
*newAccLoadInsertPoint))
743+
: nullptr;
744+
bool requiresMultiBuffer = accUses.size() > 0 && !topLevelIf;
745+
// If we override the acc in the loop, it is generally hard to handle it
746+
without multibuffering. We make an exception if it is not a physical
747+
// override of a value, but just setting a flag that acc is not used. In
748+
this case we don't need a different buffer to store the init value.
749+
requiresMultiBuffer |=
750+
accOverridePoint.has_value() && !accOverridePoint->isFlag;
692751

693752
// 2. If the acc is not overwritten in the loop (by op other than the dot),
694753
// it cannot be multi-buffered. This is because the overwrite is the only
695754
// way to initialize next buffer without incurring a copy.
696-
bool canMultiBuffer = accOverridePoint.has_value();
697-
755+
bool canMultiBuffer = accOverridePoint.has_value() &&
756+
!mlir::triton::getDisallowAccMultiBuffer(forOp);
698757
if (requiresMultiBuffer && !canMultiBuffer) {
699758
continue;
700759
}
@@ -703,7 +762,7 @@ FailureOr<scf::ForOp> preProcessLoopForTC05MMAPipelining(scf::ForOp forOp,
703762
.accLoad = accLoad,
704763
.accDef = accOverridePoint,
705764
.yieldArgNo = yieldArgNo,
706-
.accIsMultiBuffered = requiresMultiBuffer};
765+
.accIsMultiBuffered = canMultiBuffer};
707766

708767
builder.setInsertionPoint(forOp);
709768
Value zero = builder.create<arith::ConstantIntOp>(forOp.getLoc(), 0, 32);

python/test/unit/language/test_matmul.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,8 @@ def simple_persistent_kernel(a_ptr, b_ptr, c_ptr, M, N, K, stride_am, stride_ak,
147147
stride_bk, stride_bn, #
148148
stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
149149
BLOCK_SIZE_K: tl.constexpr, #
150-
GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
150+
GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr,
151+
DISALLOW_ACC_MULTI_BUFFER: tl.constexpr):
151152
start_pid = tl.program_id(axis=0)
152153
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
153154
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
@@ -171,7 +172,7 @@ def simple_persistent_kernel(a_ptr, b_ptr, c_ptr, M, N, K, stride_am, stride_ak,
171172

172173
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
173174

174-
for _ in range(0, k_tiles * tiles_per_SM):
175+
for _ in tl.range(0, k_tiles * tiles_per_SM, disallow_acc_multi_buffer=DISALLOW_ACC_MULTI_BUFFER):
175176
ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
176177
if ki == 0:
177178
tile_id += NUM_SMS
@@ -220,7 +221,8 @@ def simple_persistent_kernel(a_ptr, b_ptr, c_ptr, M, N, K, stride_am, stride_ak,
220221
@pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(128, 128, 16), (64, 128, 32), (32, 32, 32), (256, 128, 16),
221222
(64, 512, 16), (512, 64, 16), (64, 16, 16)])
222223
@pytest.mark.parametrize("NUM_WARPS", [4, 8])
223-
def test_simple_persistent_matmul(BLOCK_M, BLOCK_N, BLOCK_K, NUM_WARPS, device):
224+
@pytest.mark.parametrize("DISALLOW_ACC_MULTI_BUFFER", [True, False])
225+
def test_simple_persistent_matmul(BLOCK_M, BLOCK_N, BLOCK_K, NUM_WARPS, DISALLOW_ACC_MULTI_BUFFER, device):
224226
M, N, K = 1024, 512, 256
225227
NUM_STAGES = 3
226228
a = torch.randn(M, K, dtype=torch.float16, device=device)
@@ -238,7 +240,8 @@ def test_simple_persistent_matmul(BLOCK_M, BLOCK_N, BLOCK_K, NUM_WARPS, device):
238240
b.stride(0), b.stride(1), #
239241
output.stride(0), output.stride(1), #
240242
BLOCK_SIZE_M=BLOCK_M, BLOCK_SIZE_N=BLOCK_N, BLOCK_SIZE_K=BLOCK_K, #
241-
GROUP_SIZE_M=8, NUM_SMS=NUM_SMS, num_stages=NUM_STAGES, num_warps=NUM_WARPS)
243+
GROUP_SIZE_M=8, NUM_SMS=NUM_SMS, DISALLOW_ACC_MULTI_BUFFER=DISALLOW_ACC_MULTI_BUFFER, num_stages=NUM_STAGES,
244+
num_warps=NUM_WARPS)
242245
ref_out = torch.matmul(a.to(torch.float32), b.to(torch.float32)).to(torch.float16)
243246

244247
torch.testing.assert_close(ref_out, output, atol=0.01, rtol=0.01)
@@ -250,8 +253,8 @@ def test_simple_persistent_matmul(BLOCK_M, BLOCK_N, BLOCK_K, NUM_WARPS, device):
250253
if (device == "cuda" and torch.cuda.get_device_capability()[0] == 10 and BLOCK_M % 64 == 0 and BLOCK_N % 8 == 0
251254
and BLOCK_N > 16):
252255
ttgir = k.asm["ttgir"]
253-
pattern = (r"ttng.wait_barrier %arg")
254-
assert re.search(pattern, str(ttgir)), "The TTGIR does not match the expected pattern."
256+
pattern = "ttng.wait_barrier %arg"
257+
assert ttgir.count(pattern) > 0, "Expect barrier coming from the previous iteration."
255258

256259

257260
@triton.jit

python/triton/compiler/code_generator.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -998,6 +998,7 @@ def visit_For(self, node):
998998
return
999999
num_stages = None
10001000
loop_unroll_factor = None
1001+
disallow_acc_multi_buffer = False
10011002
flatten = None
10021003
if IteratorClass is language.range:
10031004
iterator = IteratorClass(*iter_args, **iter_kwargs)
@@ -1009,6 +1010,7 @@ def visit_For(self, node):
10091010
step = iterator.step
10101011
num_stages = iterator.num_stages
10111012
loop_unroll_factor = iterator.loop_unroll_factor
1013+
disallow_acc_multi_buffer = iterator.disallow_acc_multi_buffer
10121014
flatten = iterator.flatten
10131015
elif IteratorClass is range:
10141016
# visit iterator arguments
@@ -1084,6 +1086,8 @@ def visit_For(self, node):
10841086
for_op.set_attr("tt.num_stages", self.builder.get_int32_attr(num_stages))
10851087
if loop_unroll_factor is not None:
10861088
for_op.set_attr("tt.loop_unroll_factor", self.builder.get_int32_attr(loop_unroll_factor))
1089+
if disallow_acc_multi_buffer:
1090+
for_op.set_attr("tt.disallow_acc_multi_buffer", self.builder.get_unit_attr())
10871091
if flatten:
10881092
for_op.set_attr("tt.flatten", self.builder.get_unit_attr())
10891093

python/triton/language/core.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2865,12 +2865,15 @@ def kernel(...):
28652865
:param loop_unroll_factor: Tells the Triton IR level loop unroller how many
28662866
times to unroll a for loop that this range is used with. Less than 2 for
28672867
this value implies no unrolling.
2868+
:param disallow_acc_multi_buffer: If true, prevent the accumulator of the dot
2869+
operation in the loop from being multi-buffered, if applicable.
28682870
:param flatten: automatically flatten the loop nest starting at this loop to
28692871
create a single flattened loop. The compiler will try to pipeline the
28702872
flattened loop which can avoid stage stalling.
28712873
"""
28722874

2873-
def __init__(self, arg1, arg2=None, step=None, num_stages=None, loop_unroll_factor=None, flatten=None):
2875+
def __init__(self, arg1, arg2=None, step=None, num_stages=None, loop_unroll_factor=None,
2876+
disallow_acc_multi_buffer=False, flatten=None):
28742877
if step is None:
28752878
self.step = constexpr(1)
28762879
else:
@@ -2883,6 +2886,7 @@ def __init__(self, arg1, arg2=None, step=None, num_stages=None, loop_unroll_fact
28832886
self.end = arg2
28842887
self.num_stages = num_stages
28852888
self.loop_unroll_factor = loop_unroll_factor
2889+
self.disallow_acc_multi_buffer = disallow_acc_multi_buffer
28862890
self.flatten = flatten
28872891

28882892
def __iter__(self):

0 commit comments

Comments
 (0)