Commit ab4a29a

[BACKEND] Recommit tmem alloc hoisting out of if (#7605)
Change the flow to only apply the hoisting out of if after pipelining and warp specialization, as this transformation may block multi-buffering.
1 parent 2f5a031 commit ab4a29a
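
To make the new flow concrete, a compilation pipeline driven through the Python bindings changed below could register the pass twice. This is a sketch only: the build_pipeline wrapper, the pm pass-manager handle, and the exact position of the pipelining passes are assumptions, not part of this commit.

    # Sketch: ordering taken from the commit message; surrounding code assumed.
    from triton._C.libtriton import passes

    def build_pipeline(pm):
        # Keep TMEM allocs inside scf.if while pipelining runs, so that
        # multi-buffering is not blocked.
        passes.ttgpuir.add_hoist_tmem_alloc(pm, False)
        # ... pipelining and warp specialization passes would run here ...
        # Afterwards it is safe to hoist the allocations out of if statements.
        passes.ttgpuir.add_hoist_tmem_alloc(pm, True)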

File tree

7 files changed: +250 -44 lines

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 5 additions & 0 deletions
@@ -60,6 +60,11 @@ def TritonGPUHoistTMEMAlloc : Pass<"tritongpu-hoist-tmem-alloc", "mlir::ModuleOp
                          "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
                          "mlir::scf::SCFDialect",
                          "mlir::arith::ArithDialect"];
+  let options = [
+    Option<"hoistOutOfIf", "hoist-out-of-if",
+           "bool", /*default*/"false",
+           "Hoist TMEM allocations out of if statements">
+  ];
 }
 
 def TritonGPUTestPipelineLowerLoop : Pass<"tritongpu-test-pipeline-lower-loop", "mlir::ModuleOp"> {

lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp

Lines changed: 176 additions & 27 deletions
@@ -38,12 +38,14 @@ using TMEMTokenLoadOp = HasToken<ttng::TMEMLoadOp>;
 using TMEMTokenStoreOp = HasToken<ttng::TMEMStoreOp>;
 using TMEMTokenAllocOp = HasToken<ttng::TMEMAllocOp>;
 
-class CombineTMEMStoreAndSelect : public OpRewritePattern<TMEMTokenStoreOp> {
+class CombineTMEMStoreAndSelect : public OpRewritePattern<ttng::TMEMStoreOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(TMEMTokenStoreOp store,
+  LogicalResult matchAndRewrite(ttng::TMEMStoreOp store,
                                 PatternRewriter &rewriter) const override {
+    if (!store.getDep())
+      return failure();
     Value src = store.getSrc();
     auto select = src.getDefiningOp<arith::SelectOp>();
     if (!select) {
@@ -79,12 +81,14 @@ class CombineTMEMStoreAndSelect : public OpRewritePattern<TMEMTokenStoreOp> {
   }
 };
 
-class RemoveUnusedTMEMLoad : public OpRewritePattern<TMEMTokenLoadOp> {
+class RemoveUnusedTMEMLoad : public OpRewritePattern<ttng::TMEMLoadOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(TMEMTokenLoadOp load,
+  LogicalResult matchAndRewrite(ttng::TMEMLoadOp load,
                                 PatternRewriter &rewriter) const override {
+    if (!load.getDep())
+      return failure();
     if (!load.getResult().use_empty())
       return failure();
     rewriter.replaceAllUsesWith(load.getToken(), load.getDep());
@@ -93,12 +97,14 @@ class RemoveUnusedTMEMLoad : public OpRewritePattern<TMEMTokenLoadOp> {
   }
 };
 
 // Load-store forwarding pattern.
-class CombineTMEMLoadAndStore : public OpRewritePattern<TMEMTokenStoreOp> {
+class CombineTMEMLoadAndStore : public OpRewritePattern<ttng::TMEMStoreOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(TMEMTokenStoreOp store,
+  LogicalResult matchAndRewrite(ttng::TMEMStoreOp store,
                                 PatternRewriter &rewriter) const override {
+    if (!store.getDep())
+      return failure();
     auto load = store.getDep().getDefiningOp<HasToken<ttng::TMEMLoadOp>>();
     if (!load || load.getResult() != store.getSrc() ||
         load.getSrc() != store.getDst())
@@ -108,12 +114,14 @@ class CombineTMEMLoadAndStore : public OpRewritePattern<TMEMTokenStoreOp> {
   }
 };
 
-class SinkTMEMLoad : public OpRewritePattern<TMEMTokenLoadOp> {
+class SinkTMEMLoad : public OpRewritePattern<ttng::TMEMLoadOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(TMEMTokenLoadOp load,
+  LogicalResult matchAndRewrite(ttng::TMEMLoadOp load,
                                 PatternRewriter &rewriter) const override {
+    if (!load.getDep())
+      return failure();
     auto forOp = load->getParentOfType<scf::ForOp>();
     if (!forOp) {
       return failure();
@@ -148,14 +156,130 @@ class SinkTMEMLoad : public OpRewritePattern<TMEMTokenLoadOp> {
   }
 };
 
+// Combine back TMEM alloc and store. This is equivalent but gives us a more
+// canonical form to do further optimizations.
+class CombineTMEMStoreAndAlloc : public OpRewritePattern<ttng::TMEMStoreOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ttng::TMEMStoreOp store,
+                                PatternRewriter &rewriter) const override {
+    if (!store.getDep())
+      return failure();
+    if (!matchPattern(store.getPred(), m_One()))
+      return failure();
+    auto alloc = store.getDep().getDefiningOp<TMEMTokenAllocOp>();
+    if (!alloc)
+      return failure();
+    if (store.getDst() != alloc.getResult())
+      return failure();
+    if (alloc->getBlock() != store->getBlock())
+      return failure();
+    alloc.getSrcMutable().assign(store.getSrc());
+    rewriter.replaceOp(store, alloc.getToken());
+    return success();
+  }
+};
+
+// Hoists a tmem alloc outside an if op like this:
+// %0 = scf.if {
+//   %1, %token0 = tmem.alloc %init
+//   ...
+//   %2 = tmem.load %1, %token1
+//   scf.yield %2
+// } else {
+//   scf.yield %init
+// }
+// ->
+// %a, %token0 = tmem.alloc %init
+// %token2 = scf.if {
+//
+//   ...
+//   scf.yield %token1
+// } else {
+//   scf.yield %token0
+// }
+// %2 = tmem.load %a, %token2
+class HoistTMEMAllocOutOfIf : public OpRewritePattern<ttng::TMEMAllocOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ttng::TMEMAllocOp alloc,
+                                PatternRewriter &rewriter) const override {
+    if (!alloc.getToken())
+      return failure();
+    Value init = alloc.getSrc();
+    if (!init)
+      return failure();
+    auto ifOp = dyn_cast<scf::IfOp>(alloc->getParentOp());
+    if (!ifOp || !ifOp.elseBlock())
+      return failure();
+    auto thenOp = ifOp.thenBlock()->getTerminator();
+    auto elseOp = ifOp.elseBlock()->getTerminator();
+    SmallVector<int> yieldArgs;
+    for (auto [thenOperand, elseOperand] :
+         llvm::zip(thenOp->getOpOperands(), elseOp->getOpOperands())) {
+      auto load = thenOperand.get().getDefiningOp<TMEMTokenLoadOp>();
+      if (!load || load.getSrc() != alloc.getResult())
+        continue;
+      if (elseOperand.get() != init)
+        continue;
+      yieldArgs.push_back(thenOperand.getOperandNumber());
+    }
+    if (yieldArgs.empty())
+      return failure();
+    // Since init is used in the else terminator we know that it dominates the
+    // if op.
+    alloc->moveBefore(ifOp);
+    rewriter.setInsertionPointAfter(ifOp);
+    for (int argNo : yieldArgs) {
+      auto load =
+          cast<TMEMTokenLoadOp>(thenOp->getOperand(argNo).getDefiningOp());
+      auto newLoad = cast<TMEMTokenLoadOp>(rewriter.clone(*load));
+      rewriter.modifyOpInPlace(ifOp, [&] {
+        ifOp->getResult(argNo).replaceAllUsesWith(newLoad.getResult());
+        newLoad.getDepMutable().assign(ifOp->getResult(argNo));
+        thenOp->setOperand(argNo, load.getToken());
+        elseOp->setOperand(argNo, alloc.getToken());
+        ifOp->getResult(argNo).setType(newLoad.getToken().getType());
+      });
+    }
+    return success();
+  }
+};
+
+// Forward a TMEM load into the user allocation.
+class TMEMLoadForwarding : public OpRewritePattern<ttng::TMEMAllocOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ttng::TMEMAllocOp alloc,
+                                PatternRewriter &rewriter) const override {
+    if (!alloc.getToken())
+      return failure();
+    Value init = alloc.getSrc();
+    if (!init)
+      return failure();
+    auto load = init.getDefiningOp<TMEMTokenLoadOp>();
+    if (!load || !load->hasOneUse() || !load.getDep().hasOneUse())
+      return failure();
+    if (alloc.getType() != load.getSrc().getType())
+      return failure();
+    rewriter.replaceOp(alloc, {load.getSrc(), load.getDep()});
+    return success();
+  }
+};
+
 // Remove loop-carried tensor dependencies if they are fed immediately into a
 // TMEM store by pulling the store into the previous iteration.
-class RotateTMEMStoreInLoop : public OpRewritePattern<TMEMTokenStoreOp> {
+class RotateTMEMStoreInLoop : public OpRewritePattern<ttng::TMEMStoreOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(TMEMTokenStoreOp store,
+  LogicalResult matchAndRewrite(ttng::TMEMStoreOp store,
                                 PatternRewriter &rewriter) const override {
+    if (!store.getDep())
+      return failure();
     // Pattern match stores whose source comes from a loop region argument and
     // whose predicate is loop-invariant.
     scf::ForOp forOp = dyn_cast<scf::ForOp>(store->getParentOp());
@@ -215,12 +339,14 @@ class RotateTMEMStoreInLoop : public OpRewritePattern<TMEMTokenStoreOp> {
 
 // Remove loop-carried tensor dependencies if they are the result of TMEM loads
 // at the end of the loop by pushing the load into the next iteration.
-class RotateTMEMLoadInLoop : public OpRewritePattern<TMEMTokenLoadOp> {
+class RotateTMEMLoadInLoop : public OpRewritePattern<ttng::TMEMLoadOp> {
 public:
   using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(TMEMTokenLoadOp load,
+  LogicalResult matchAndRewrite(ttng::TMEMLoadOp load,
                                 PatternRewriter &rewriter) const override {
+    if (!load.getDep())
+      return failure();
     // Pattern match loads whose results are only passed into the next iteration
     // of a loop.
     scf::ForOp forOp = dyn_cast<scf::ForOp>(load->getParentOp());
@@ -391,32 +517,55 @@ struct HoistTMEMAlloc
 
   void runOnOperation() override {
     ModuleOp m = getOperation();
-    SmallVector<ttng::MMAv5OpInterface> mmaOps;
-    m.walk([&](ttng::MMAv5OpInterface mmaOp) { mmaOps.push_back(mmaOp); });
-    for (auto mmaOp : mmaOps) {
-      auto forOp = dyn_cast<scf::ForOp>(mmaOp->getParentOp());
-      if (!forOp) {
-        continue;
+    if (!hoistOutOfIf) {
+      SmallVector<ttng::MMAv5OpInterface> mmaOps;
+      m.walk([&](ttng::MMAv5OpInterface mmaOp) { mmaOps.push_back(mmaOp); });
+      for (auto mmaOp : mmaOps) {
+        auto forOp = dyn_cast<scf::ForOp>(mmaOp->getParentOp());
+        if (!forOp) {
+          continue;
+        }
+        hoistInvariantInputs(mmaOp, forOp);
+
+        // Only hoist the TMEM alloc feeding into the accumulator. Leave the
+        // ones for the scales in the loop.
+        auto alloc = mmaOp.getAccumulator().getDefiningOp<TMEMTokenAllocOp>();
+        if (!alloc || alloc->getParentRegion() != mmaOp->getParentRegion()) {
+          continue;
+        }
+        hoistTMEMAlloc(alloc, forOp);
       }
-      hoistInvariantInputs(mmaOp, forOp);
-
-      // Only hoist the TMEM alloc feeding into the accumulator. Leave the ones
-      // for the scales in the loop.
-      auto alloc = mmaOp.getAccumulator().getDefiningOp<TMEMTokenAllocOp>();
-      if (!alloc || alloc->getParentRegion() != mmaOp->getParentRegion()) {
-        continue;
-      }
-      hoistTMEMAlloc(alloc, forOp);
     }
 
     mlir::RewritePatternSet patterns(&getContext());
     patterns.add<RotateTMEMStoreInLoop, RotateTMEMLoadInLoop,
                  CombineTMEMLoadAndStore, CombineTMEMStoreAndSelect,
                  SinkTMEMLoad, RemoveUnusedTMEMLoad>(&getContext());
+    if (hoistOutOfIf) {
+      patterns.add<CombineTMEMStoreAndAlloc, HoistTMEMAllocOutOfIf,
+                   TMEMLoadForwarding>(&getContext());
+    }
     scf::ForOp::getCanonicalizationPatterns(patterns, &getContext());
     if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
      llvm_unreachable("Failed to hoist tmem_store");
     }
+
+    // TODO: currently some code assumes that a mutable tmem alloc doesn't have
+    // an initial value. As a workaround we break up the op in order to keep
+    // this form for the downstream passes. We should remove this once the
+    // downstream passes are fixed.
+    m.walk([&](ttng::TMEMAllocOp alloc) {
+      if (alloc.getType().getMutableMemory() && alloc.getSrc()) {
+        OpBuilder builder(alloc);
+        builder.setInsertionPointAfter(alloc);
+        auto store = builder.create<ttng::TMEMStoreOp>(
+            alloc.getLoc(), builder.getType<AsyncTokenType>(),
+            alloc.getResult(), alloc.getToken(), alloc.getSrc(),
+            builder.create<arith::ConstantIntOp>(alloc.getLoc(), 1, 1));
+        alloc.getToken().replaceAllUsesExcept(store.getToken(), store);
+        alloc.getSrcMutable().clear();
+      }
+    });
   }
 };
 

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 7 additions & 4 deletions
@@ -894,11 +894,14 @@ void multibufferTensorMemory(scf::ForOp forOp, CoarseSchedule &schedule,
         llvm::to_vector(alloc.getResult().getUsers());
     Value replTok = OpBuilder(forOp).create<ub::PoisonOp>(
         forOp.getLoc(), builder.getType<AsyncTokenType>());
+    if (newAlloc.getToken()) {
+      newAlloc.getToken().replaceAllUsesWith(replTok);
+    }
     for (auto user : allocUsers) {
      if (auto store = dyn_cast<ttng::TMEMStoreOp>(user)) {
+        store.getDepMutable().clear();
+        store.getToken().replaceAllUsesWith(replTok);
        if (forOp->isAncestor(store)) {
-          store.getDepMutable().clear();
-          store.getToken().replaceAllUsesWith(replTok);
          // We can multibuffer, since the store is a point where we can
          // change the buffer index
          multibufferingIsValid = true;
@@ -926,9 +929,9 @@ void multibufferTensorMemory(scf::ForOp forOp, CoarseSchedule &schedule,
          store.getDstMutable().assign(tmemSlice);
        }
      } else if (auto load = dyn_cast<ttng::TMEMLoadOp>(user)) {
+        load.getDepMutable().clear();
+        load.getToken().replaceAllUsesWith(replTok);
        if (forOp->isAncestor(load)) {
-          load.getDepMutable().clear();
-          load.getToken().replaceAllUsesWith(replTok);
          builder.setStageCluster(schedule[load]);
          builder.setInsertionPoint(load);
          Value curBufIdx = getCurrBufIdx(load);

python/src/passes.cc

Lines changed: 2 additions & 1 deletion
@@ -59,7 +59,8 @@ void init_triton_passes_ttgpuir(py::module &&m) {
   ADD_PASS_WRAPPER_0("add_coalesce", createTritonGPUCoalesce);
   ADD_PASS_WRAPPER_0("add_optimize_thread_locality",
                      createTritonGPUOptimizeThreadLocality);
-  ADD_PASS_WRAPPER_0("add_hoist_tmem_alloc", createTritonGPUHoistTMEMAlloc);
+  ADD_PASS_OPTION_WRAPPER_1("add_hoist_tmem_alloc",
+                            createTritonGPUHoistTMEMAlloc, bool);
   ADD_PASS_OPTION_WRAPPER_1("add_assign_latencies",
                             createTritonGPUAssignLatencies, int);
   ADD_PASS_WRAPPER_0("add_schedule_loops", createTritonGPUScheduleLoops);
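
Because the wrapper changed from ADD_PASS_WRAPPER_0 to ADD_PASS_OPTION_WRAPPER_1, Python call sites now pass the hoist-out-of-if flag explicitly. A minimal sketch of the updated call, with the pass-manager handle pm assumed:

    # Old form took no option: passes.ttgpuir.add_hoist_tmem_alloc(pm)
    # New form takes the option as a bool:
    passes.ttgpuir.add_hoist_tmem_alloc(pm, False)  # keep allocs inside scf.if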

test/TritonGPU/hoist-tmem-alloc.mlir

Lines changed: 46 additions & 0 deletions
@@ -1,4 +1,5 @@
 // RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -tritongpu-hoist-tmem-alloc -canonicalize | FileCheck %s
+// RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -tritongpu-hoist-tmem-alloc="hoist-out-of-if=true" -canonicalize | FileCheck %s -check-prefix=HOIST-IF
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
@@ -307,3 +308,48 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return %res_f16 : tensor<128x128xf16, #blocked>
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
+  // HOIST-IF-LABEL: @hoist_out_of_if
+  tt.func public @hoist_out_of_if(%arg0: i1, %arg1: tensor<128x128xf32, #blocked>) -> tensor<128x128xf32, #blocked> {
+    // HOIST-IF: %[[A:.+]], %[[T0:.+]] = ttng.tmem_alloc : ()
+    // HOIST-IF: %[[T1:.+]] = ttng.tmem_store %{{.*}}, %[[A]][%[[T0]]]
+    // HOIST-IF: %[[I:.+]] = scf.if %{{.+}} -> (!ttg.async.token) {
+    // HOIST-IF: %[[T2:.+]] = "write_to_tmem"
+    // HOIST-IF: scf.yield %[[T2]]
+    // HOIST-IF: } else {
+    // HOIST-IF: scf.yield %[[T1]]
+    // HOIST-IF: }
+    // HOIST-IF: %[[L:.+]], %[[T4:.+]] = ttng.tmem_load %[[A]][%[[I]]
+    // HOIST-IF: tt.return %[[L]]
+    %0 = scf.if %arg0 -> (tensor<128x128xf32, #blocked>) {
+      %result, %token = ttng.tmem_alloc %arg1 : (tensor<128x128xf32, #blocked>) -> (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)
+      %1 = "write_to_tmem"(%result) : (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>) -> !ttg.async.token
+      %result_0, %token_1 = ttng.tmem_load %result[%1] : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
+      scf.yield %result_0 : tensor<128x128xf32, #blocked>
+    } else {
+      scf.yield %arg1 : tensor<128x128xf32, #blocked>
+    }
+    tt.return %0 : tensor<128x128xf32, #blocked>
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @forward_tmem_load(%m: !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, %t: !ttg.async.token) -> (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token) {
+    %result, %token0 = ttng.tmem_load %m[%t] : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
+    // HOIST-IF-LABEL: @forward_tmem_load
+    // HOIST-IF-SAME: %[[ARG0:.+]]: !ttg.memdesc<128x128xf32,
+    // HOIST-IF-SAME: %[[ARG1:.+]]: !ttg.async.token
+    // HOIST-IF-NEXT: tt.return %[[ARG0]], %[[ARG1]]
+    %result1, %token1 = ttng.tmem_alloc %result : (tensor<128x128xf32, #blocked>) -> (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)
+    tt.return %result1, %token1 : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
+  }
+}
