Skip to content

Commit 96e91d4

Browse files
authored
[BACKEND] Hoist tmem alloc outside of if (#7568)
When a tmem alloc is used outside an if, hoisting the alloc out of the if avoids creating a large number of registers at the end of the block.
1 parent 7affb3b commit 96e91d4

File tree

2 files changed

+136
-1
lines changed

2 files changed

+136
-1
lines changed

lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,94 @@ class SinkTMEMLoad : public OpRewritePattern<TMEMTokenLoadOp> {
148148
}
149149
};
150150

151+
// Combine back TMEM alloc and store. This is equivalent but gives us a more
152+
// canonical form to do further optimizations.
153+
class CombineTMEMStoreAndAlloc : public OpRewritePattern<TMEMTokenStoreOp> {
154+
public:
155+
using OpRewritePattern::OpRewritePattern;
156+
157+
LogicalResult matchAndRewrite(TMEMTokenStoreOp store,
158+
PatternRewriter &rewriter) const override {
159+
if (!matchPattern(store.getPred(), m_One()))
160+
return failure();
161+
auto alloc = store.getDep().getDefiningOp<TMEMTokenAllocOp>();
162+
if (!alloc)
163+
return failure();
164+
if (alloc->getBlock() != store->getBlock())
165+
return failure();
166+
alloc.getSrcMutable().assign(store.getSrc());
167+
rewriter.replaceOp(store, alloc.getToken());
168+
return success();
169+
}
170+
};
171+
172+
// Hoists a tmem alloc outside an if op like this:
173+
// %0 = scf.if {
174+
// %1, %token0 = tmem.alloc %init
175+
// ...
176+
// %2 = tmem.load %1, %token1
177+
// scf.yield %2
178+
// } else {
179+
// scf.yield %init
180+
// }
181+
// ->
182+
// %a, %token0 = tmem.alloc %init
183+
// %token2 = scf.if {
184+
//
185+
// ...
186+
// scf.yield %token1
187+
// } else {
188+
// scf.yield %token0
189+
// }
190+
// %2 = tmem.load %a, %token2
191+
class HoistTMEMAllocOutOfIf : public OpRewritePattern<ttng::TMEMAllocOp> {
192+
public:
193+
using OpRewritePattern::OpRewritePattern;
194+
195+
LogicalResult matchAndRewrite(ttng::TMEMAllocOp alloc,
196+
PatternRewriter &rewriter) const override {
197+
if (!alloc.getToken())
198+
return failure();
199+
Value init = alloc.getSrc();
200+
if (!init)
201+
return failure();
202+
auto ifOp = dyn_cast<scf::IfOp>(alloc->getParentOp());
203+
if (!ifOp)
204+
return failure();
205+
auto thenOp = ifOp.thenBlock()->getTerminator();
206+
auto elseOp = ifOp.elseBlock()->getTerminator();
207+
SmallVector<int> yieldArgs;
208+
for (auto [thenOperand, elseOperand] :
209+
llvm::zip(thenOp->getOpOperands(), elseOp->getOpOperands())) {
210+
auto load = thenOperand.get().getDefiningOp<TMEMTokenLoadOp>();
211+
if (!load || load.getSrc() != alloc.getResult())
212+
continue;
213+
if (elseOperand.get() != init)
214+
continue;
215+
yieldArgs.push_back(thenOperand.getOperandNumber());
216+
}
217+
if (yieldArgs.empty())
218+
return failure();
219+
// Since init is used in the else terminator we know that it dominates the
220+
// if op.
221+
alloc->moveBefore(ifOp);
222+
rewriter.setInsertionPointAfter(ifOp);
223+
for (int argNo : yieldArgs) {
224+
auto load =
225+
cast<TMEMTokenLoadOp>(thenOp->getOperand(argNo).getDefiningOp());
226+
auto newLoad = cast<TMEMTokenLoadOp>(rewriter.clone(*load));
227+
rewriter.modifyOpInPlace(ifOp, [&] {
228+
ifOp->getResult(argNo).replaceAllUsesWith(newLoad.getResult());
229+
newLoad.getDepMutable().assign(ifOp->getResult(argNo));
230+
thenOp->setOperand(argNo, load.getToken());
231+
elseOp->setOperand(argNo, alloc.getToken());
232+
ifOp->getResult(argNo).setType(newLoad.getToken().getType());
233+
});
234+
}
235+
return success();
236+
}
237+
};
238+
151239
// Remove loop-carried tensor dependencies if they are fed immediately into a
152240
// TMEM store by pulling the store into the previous iteration.
153241
class RotateTMEMStoreInLoop : public OpRewritePattern<TMEMTokenStoreOp> {
@@ -412,11 +500,29 @@ struct HoistTMEMAlloc
412500
mlir::RewritePatternSet patterns(&getContext());
413501
patterns.add<RotateTMEMStoreInLoop, RotateTMEMLoadInLoop,
414502
CombineTMEMLoadAndStore, CombineTMEMStoreAndSelect,
415-
SinkTMEMLoad, RemoveUnusedTMEMLoad>(&getContext());
503+
SinkTMEMLoad, RemoveUnusedTMEMLoad, CombineTMEMStoreAndAlloc,
504+
HoistTMEMAllocOutOfIf>(&getContext());
416505
scf::ForOp::getCanonicalizationPatterns(patterns, &getContext());
417506
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
418507
llvm_unreachable("Failed to hoist tmem_store");
419508
}
509+
510+
// TODO: currently some code assumes that a mutable tmem alloc doesn't have
511+
// an initial value. As a workaround we break up the op in order to keep
512+
// this form for the downstream passes. We should remove this once the
513+
// downstream passes are fixed.
514+
m.walk([&](ttng::TMEMAllocOp alloc) {
515+
if (alloc.getType().getMutableMemory() && alloc.getSrc()) {
516+
OpBuilder builder(alloc);
517+
builder.setInsertionPointAfter(alloc);
518+
auto store = builder.create<ttng::TMEMStoreOp>(
519+
alloc.getLoc(), builder.getType<AsyncTokenType>(),
520+
alloc.getResult(), alloc.getToken(), alloc.getSrc(),
521+
builder.create<arith::ConstantIntOp>(alloc.getLoc(), 1, 1));
522+
alloc.getToken().replaceAllUsesExcept(store.getToken(), store);
523+
alloc.getSrcMutable().clear();
524+
}
525+
});
420526
}
421527
};
422528

test/TritonGPU/hoist-tmem-alloc.mlir

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,3 +307,32 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
307307
tt.return %res_f16 : tensor<128x128xf16, #blocked>
308308
}
309309
}
310+
311+
// -----
312+
313+
#blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
314+
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
315+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
316+
// Verifies HoistTMEMAllocOutOfIf: the tmem_alloc is hoisted above the
// scf.if (its init value becoming a separate tmem_store), the if then
// yields only an async token, and the tensor result is produced by a
// tmem_load placed after the if.
// CHECK-LABEL: @hoist_out_of_if
317+
tt.func public @hoist_out_of_if(%arg0: i1, %arg1: tensor<128x128xf32, #blocked>) -> tensor<128x128xf32, #blocked> {
318+
// CHECK: %[[A:.+]], %[[T0:.+]] = ttng.tmem_alloc : ()
319+
// CHECK: %[[T1:.+]] = ttng.tmem_store %{{.*}}, %[[A]][%[[T0]]]
320+
// CHECK: %[[I:.+]] = scf.if %{{.+}} -> (!ttg.async.token) {
321+
// CHECK: %[[T2:.+]] = "write_to_tmem"
322+
// CHECK: scf.yield %[[T2]]
323+
// CHECK: } else {
324+
// CHECK: scf.yield %[[T1]]
325+
// CHECK: }
326+
// CHECK: %[[L:.+]], %[[T4:.+]] = ttng.tmem_load %[[A]][%[[I]]
327+
// CHECK: tt.return %[[L]]
328+
%0 = scf.if %arg0 -> (tensor<128x128xf32, #blocked>) {
329+
%result, %token = ttng.tmem_alloc %arg1 : (tensor<128x128xf32, #blocked>) -> (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)
330+
%1 = "write_to_tmem"(%result) : (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>) -> !ttg.async.token
331+
%result_0, %token_1 = ttng.tmem_load %result[%1] : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
332+
scf.yield %result_0 : tensor<128x128xf32, #blocked>
333+
} else {
334+
scf.yield %arg1 : tensor<128x128xf32, #blocked>
335+
}
336+
tt.return %0 : tensor<128x128xf32, #blocked>
337+
}
338+
}

0 commit comments

Comments
 (0)