Skip to content

Commit d82cfd3

Browse files
authored
[Blackwell] Handle control flow in TMEM allocation (#7698)
TODO: write unit test
1 parent d8774e3 commit d82cfd3

File tree

2 files changed

+133
-21
lines changed

2 files changed

+133
-21
lines changed

lib/Dialect/TritonNvidiaGPU/Transforms/TensorMemoryAllocation.cpp

Lines changed: 90 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
#include "mlir/Analysis/Liveness.h"
2+
#include "mlir/Dialect/Arith/IR/Arith.h"
3+
#include "mlir/Interfaces/ControlFlowInterfaces.h"
24
#include "mlir/Support/LogicalResult.h"
35
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
46
#include "mlir/Transforms/Passes.h"
57
#include "triton/Analysis/Allocation.h"
68
#include "triton/Dialect/Triton/IR/Utility.h"
9+
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
10+
#include "triton/Dialect/TritonGPU/IR/Traits.h"
711
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
812
#include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h"
913
#include "llvm/ADT/EquivalenceClasses.h"
@@ -175,30 +179,92 @@ static TMemChunk allocFirstFit(MemoryBitMap &memoryMap,
175179
return chunk;
176180
}
177181

178-
static Operation *getAlloc(Value value) {
179-
while (true) {
180-
if (auto allocOp = value.getDefiningOp<TMEMAllocOp>())
181-
return allocOp;
182-
if (auto indexOp = value.getDefiningOp<ttg::MemDescIndexOp>()) {
183-
value = indexOp.getSrc();
182+
static SmallVector<Operation *> getAlloc(Value value) {
183+
SmallVector<Operation *> allocs;
184+
DenseSet<Value> seen;
185+
SmallVector<Value> worklist{value};
186+
187+
while (!worklist.empty()) {
188+
Value v = worklist.pop_back_val();
189+
if (!seen.insert(v).second)
184190
continue;
185-
}
186-
if (auto reinterpOp = value.getDefiningOp<ttg::MemDescReinterpretOp>()) {
187-
value = reinterpOp.getSrc();
191+
192+
// Handle block arguments.
193+
if (auto arg = dyn_cast<BlockArgument>(v)) {
194+
Block *block = arg.getOwner();
195+
Operation *parentOp = block->getParentOp();
196+
197+
// Handle block with predecessors.
198+
if (!block->isEntryBlock()) {
199+
for (Block *pred : block->getPredecessors()) {
200+
Operation *predOp = pred->getTerminator();
201+
auto br = dyn_cast<BranchOpInterface>(predOp);
202+
if (!br) {
203+
llvm::report_fatal_error("unhandled branch op: " +
204+
predOp->getName().getStringRef());
205+
}
206+
SmallVector<Attribute> operands(br->getNumOperands());
207+
auto it = llvm::find(br->getSuccessors(), block);
208+
unsigned idx = std::distance(br->getSuccessors().begin(), it);
209+
SuccessorOperands args = br.getSuccessorOperands(idx);
210+
Value operand =
211+
args.getForwardedOperands()[arg.getArgNumber() -
212+
args.getProducedOperandCount()];
213+
worklist.push_back(operand);
214+
}
215+
continue;
216+
}
217+
218+
// Handle region entry arguments.
219+
if (auto wsOp = dyn_cast<ttg::WarpSpecializePartitionsOp>(parentOp)) {
220+
worklist.push_back(
221+
wsOp.getParentOp().getExplicitCaptures()[arg.getArgNumber()]);
222+
} else if (auto forOp = dyn_cast<scf::ForOp>(parentOp)) {
223+
unsigned idx = arg.getArgNumber() - 1;
224+
worklist.push_back(forOp.getYieldedValues()[idx]);
225+
worklist.push_back(forOp.getInits()[idx]);
226+
} else if (auto whileOp = dyn_cast<scf::WhileOp>(parentOp)) {
227+
unsigned idx = arg.getArgNumber();
228+
if (arg.getParentRegion() == &whileOp.getAfter()) {
229+
worklist.push_back(whileOp.getConditionOp().getArgs()[idx]);
230+
} else {
231+
worklist.push_back(whileOp.getYieldedValues()[idx]);
232+
worklist.push_back(whileOp.getInits()[idx]);
233+
}
234+
} else {
235+
llvm::report_fatal_error(
236+
"unhandled parent op when looking for TMEM alloc: " +
237+
parentOp->getName().getStringRef());
238+
}
188239
continue;
189240
}
190-
if (auto slice = value.getDefiningOp<TMEMSubSliceOp>()) {
191-
value = slice.getSrc();
192-
continue;
241+
242+
Operation *defOp = v.getDefiningOp();
243+
unsigned idx = cast<OpResult>(v).getResultNumber();
244+
if (isa<TMEMAllocOp>(defOp)) {
245+
allocs.push_back(defOp);
246+
} else if (defOp->hasTrait<OpTrait::MemDescViewTrait>()) {
247+
worklist.push_back(defOp->getOperand(0));
248+
} else if (auto sliceOp = dyn_cast<TMEMSubSliceOp>(defOp)) {
249+
worklist.push_back(sliceOp.getSrc());
250+
} else if (auto selectOp = dyn_cast<arith::SelectOp>(defOp)) {
251+
worklist.push_back(selectOp.getTrueValue());
252+
worklist.push_back(selectOp.getFalseValue());
253+
} else if (auto ifOp = dyn_cast<scf::IfOp>(defOp)) {
254+
worklist.push_back(ifOp.thenYield().getOperand(idx));
255+
worklist.push_back(ifOp.elseYield().getOperand(idx));
256+
} else if (auto forOp = dyn_cast<scf::ForOp>(defOp)) {
257+
worklist.push_back(forOp.getYieldedValues()[idx]);
258+
worklist.push_back(forOp.getInits()[idx]);
259+
} else if (auto whileOp = dyn_cast<scf::WhileOp>(defOp)) {
260+
worklist.push_back(whileOp.getConditionOp().getArgs()[idx]);
261+
} else {
262+
llvm::report_fatal_error("unhandled op when looking for TMEM alloc: " +
263+
defOp->getName().getStringRef());
193264
}
194-
auto arg = dyn_cast<BlockArgument>(value);
195-
if (!arg || !isa<triton::gpu::WarpSpecializePartitionsOp>(
196-
arg.getOwner()->getParentOp()))
197-
llvm::report_fatal_error("expected to find a TMEM alloc op");
198-
auto partitions = cast<triton::gpu::WarpSpecializePartitionsOp>(
199-
arg.getOwner()->getParentOp());
200-
value = partitions.getParentOp().getExplicitCaptures()[arg.getArgNumber()];
201265
}
266+
267+
return allocs;
202268
}
203269

204270
class RowIdConstraints {
@@ -245,8 +311,11 @@ allocateTMem(Operation *parentOp,
245311
if (allocSize.numRows == 64) {
246312
// HW restriction, the A alloc and accumulator needs to be in the same
247313
// rows.
248-
rowIdConstraints.joinOps(getAlloc(mmaOp.getA()),
249-
getAlloc(mmaOp.getAccumulator()));
314+
SmallVector<Operation *> lhsAllocs = getAlloc(mmaOp.getA());
315+
SmallVector<Operation *> accAllocs = getAlloc(mmaOp.getAccumulator());
316+
for (Operation *lhsAlloc : lhsAllocs)
317+
for (Operation *accAlloc : accAllocs)
318+
rowIdConstraints.joinOps(lhsAlloc, accAlloc);
250319
} else {
251320
// TODO: we need to handle cases where the format is blockM and we
252321
// have multiple blocks.

test/TritonNvidiaGPU/test_tensor_memory_allocation.mlir

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,3 +350,46 @@ tt.func @alloc_warp_specialize_explicit_capture() {
350350
}
351351

352352
}
353+
354+
// -----
355+
356+
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 8}>
357+
#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = true, elementBitWidth = 8}>
358+
#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
359+
#tmem = #ttng.tensor_memory_encoding<blockM = 64, blockN = 64, unpacked = true>
360+
#tmem_scales = #ttng.tensor_memory_scales_encoding<>
361+
362+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65536 : i32} {
363+
364+
// CHECK-LABEL: @mma_lhs_tmem
365+
tt.func @mma_lhs_tmem(
366+
%b: !ttg.memdesc<64x64xf16, #shared1, #ttg.shared_memory>,
367+
%useAcc: i1,
368+
%pred: i1,
369+
%barrier: !ttg.memdesc<1xi64, #shared2, #ttg.shared_memory>,
370+
%barrierPred: i1
371+
) {
372+
// CHECK-COUNT-4: ttng.tmem_alloc {{.*}} tensor_memory_row_offset = 0 : i32
373+
// CHECK-NOT: tensor_memory_row_offset
374+
%a0 = ttng.tmem_alloc : () -> !ttg.memdesc<64x64xf16, #tmem, #ttng.tensor_memory, mutable>
375+
%a1 = ttng.tmem_alloc : () -> !ttg.memdesc<64x64xf16, #tmem, #ttng.tensor_memory, mutable>
376+
%a2 = ttng.tmem_alloc : () -> !ttg.memdesc<64x64xf16, #tmem, #ttng.tensor_memory, mutable>
377+
%c = ttng.tmem_alloc : () -> !ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>
378+
379+
%a = arith.select %barrierPred, %a0, %a1 : !ttg.memdesc<64x64xf16, #tmem, #ttng.tensor_memory, mutable>
380+
381+
cf.cond_br %barrierPred, ^switch, ^bb1(%a : !ttg.memdesc<64x64xf16, #tmem, #ttng.tensor_memory, mutable>)
382+
383+
^switch:
384+
cf.br ^bb1(%a2 : !ttg.memdesc<64x64xf16, #tmem, #ttng.tensor_memory, mutable>)
385+
386+
^bb1(%lhs: !ttg.memdesc<64x64xf16, #tmem, #ttng.tensor_memory, mutable>):
387+
ttng.tc_gen5_mma %lhs, %b, %c, %useAcc, %pred, %barrier[%barrierPred] {is_async} :
388+
!ttg.memdesc<64x64xf16, #tmem, #ttng.tensor_memory, mutable>,
389+
!ttg.memdesc<64x64xf16, #shared1, #ttg.shared_memory>,
390+
!ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>,
391+
!ttg.memdesc<1xi64, #shared2, #ttg.shared_memory>
392+
tt.return
393+
}
394+
395+
}

0 commit comments

Comments
 (0)