Skip to content

Commit d0642aa

Browse files
authored
Revert "[BACKEND] Hoist tmem alloc outside of if (#7568)" (#7597)
This reverts commit 96e91d4.
1 parent e6aa86c commit d0642aa

File tree

2 files changed

+1
-136
lines changed

2 files changed

+1
-136
lines changed

lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp

Lines changed: 1 addition & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -148,94 +148,6 @@ class SinkTMEMLoad : public OpRewritePattern<TMEMTokenLoadOp> {
148148
}
149149
};
150150

151-
// Combine back TMEM alloc and store. This is equivalent but gives us a more
152-
// canonical form to do further optimizations.
153-
class CombineTMEMStoreAndAlloc : public OpRewritePattern<TMEMTokenStoreOp> {
154-
public:
155-
using OpRewritePattern::OpRewritePattern;
156-
157-
LogicalResult matchAndRewrite(TMEMTokenStoreOp store,
158-
PatternRewriter &rewriter) const override {
159-
if (!matchPattern(store.getPred(), m_One()))
160-
return failure();
161-
auto alloc = store.getDep().getDefiningOp<TMEMTokenAllocOp>();
162-
if (!alloc)
163-
return failure();
164-
if (alloc->getBlock() != store->getBlock())
165-
return failure();
166-
alloc.getSrcMutable().assign(store.getSrc());
167-
rewriter.replaceOp(store, alloc.getToken());
168-
return success();
169-
}
170-
};
171-
172-
// Hoists a tmem alloc outside an if op like this:
173-
// %0 = scf.if {
174-
// %1, %token0 = tmem.alloc %init
175-
// ...
176-
// %2 = tmem.load %1, %token1
177-
// scf.yield %2
178-
// } else {
179-
// scf.yield %init
180-
// }
181-
// ->
182-
// %a, %token0 = tmem.alloc %init
183-
// %token2 = scf.if {
184-
//
185-
// ...
186-
// scf.yield %token1
187-
// } else {
188-
// scf.yield %token0
189-
// }
190-
// %2 = tmem.load %a, %token2
191-
class HoistTMEMAllocOutOfIf : public OpRewritePattern<ttng::TMEMAllocOp> {
192-
public:
193-
using OpRewritePattern::OpRewritePattern;
194-
195-
LogicalResult matchAndRewrite(ttng::TMEMAllocOp alloc,
196-
PatternRewriter &rewriter) const override {
197-
if (!alloc.getToken())
198-
return failure();
199-
Value init = alloc.getSrc();
200-
if (!init)
201-
return failure();
202-
auto ifOp = dyn_cast<scf::IfOp>(alloc->getParentOp());
203-
if (!ifOp)
204-
return failure();
205-
auto thenOp = ifOp.thenBlock()->getTerminator();
206-
auto elseOp = ifOp.elseBlock()->getTerminator();
207-
SmallVector<int> yieldArgs;
208-
for (auto [thenOperand, elseOperand] :
209-
llvm::zip(thenOp->getOpOperands(), elseOp->getOpOperands())) {
210-
auto load = thenOperand.get().getDefiningOp<TMEMTokenLoadOp>();
211-
if (!load || load.getSrc() != alloc.getResult())
212-
continue;
213-
if (elseOperand.get() != init)
214-
continue;
215-
yieldArgs.push_back(thenOperand.getOperandNumber());
216-
}
217-
if (yieldArgs.empty())
218-
return failure();
219-
// Since init is used in the else terminator we know that it dominates the
220-
// if op.
221-
alloc->moveBefore(ifOp);
222-
rewriter.setInsertionPointAfter(ifOp);
223-
for (int argNo : yieldArgs) {
224-
auto load =
225-
cast<TMEMTokenLoadOp>(thenOp->getOperand(argNo).getDefiningOp());
226-
auto newLoad = cast<TMEMTokenLoadOp>(rewriter.clone(*load));
227-
rewriter.modifyOpInPlace(ifOp, [&] {
228-
ifOp->getResult(argNo).replaceAllUsesWith(newLoad.getResult());
229-
newLoad.getDepMutable().assign(ifOp->getResult(argNo));
230-
thenOp->setOperand(argNo, load.getToken());
231-
elseOp->setOperand(argNo, alloc.getToken());
232-
ifOp->getResult(argNo).setType(newLoad.getToken().getType());
233-
});
234-
}
235-
return success();
236-
}
237-
};
238-
239151
// Remove loop-carried tensor dependencies if they are fed immediately into a
240152
// TMEM store by pulling the store into the previous iteration.
241153
class RotateTMEMStoreInLoop : public OpRewritePattern<TMEMTokenStoreOp> {
@@ -500,29 +412,11 @@ struct HoistTMEMAlloc
500412
mlir::RewritePatternSet patterns(&getContext());
501413
patterns.add<RotateTMEMStoreInLoop, RotateTMEMLoadInLoop,
502414
CombineTMEMLoadAndStore, CombineTMEMStoreAndSelect,
503-
SinkTMEMLoad, RemoveUnusedTMEMLoad, CombineTMEMStoreAndAlloc,
504-
HoistTMEMAllocOutOfIf>(&getContext());
415+
SinkTMEMLoad, RemoveUnusedTMEMLoad>(&getContext());
505416
scf::ForOp::getCanonicalizationPatterns(patterns, &getContext());
506417
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
507418
llvm_unreachable("Failed to hoist tmem_store");
508419
}
509-
510-
// TODO: currently some code assumes that a mutable tmem alloc doesn't have
511-
// an initial value. As a workaround we break up the op in order to keep
512-
// this form for the downstream passes. We should remove this once the
513-
// downstream passes are fixed.
514-
m.walk([&](ttng::TMEMAllocOp alloc) {
515-
if (alloc.getType().getMutableMemory() && alloc.getSrc()) {
516-
OpBuilder builder(alloc);
517-
builder.setInsertionPointAfter(alloc);
518-
auto store = builder.create<ttng::TMEMStoreOp>(
519-
alloc.getLoc(), builder.getType<AsyncTokenType>(),
520-
alloc.getResult(), alloc.getToken(), alloc.getSrc(),
521-
builder.create<arith::ConstantIntOp>(alloc.getLoc(), 1, 1));
522-
alloc.getToken().replaceAllUsesExcept(store.getToken(), store);
523-
alloc.getSrcMutable().clear();
524-
}
525-
});
526420
}
527421
};
528422

test/TritonGPU/hoist-tmem-alloc.mlir

Lines changed: 0 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -307,32 +307,3 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
307307
tt.return %res_f16 : tensor<128x128xf16, #blocked>
308308
}
309309
}
310-
311-
// -----
312-
313-
#blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
314-
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
315-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
316-
// CHECK-LABEL: @hoist_out_of_if
317-
tt.func public @hoist_out_of_if(%arg0: i1, %arg1: tensor<128x128xf32, #blocked>) -> tensor<128x128xf32, #blocked> {
318-
// CHECK: %[[A:.+]], %[[T0:.+]] = ttng.tmem_alloc : ()
319-
// CHECK: %[[T1:.+]] = ttng.tmem_store %{{.*}}, %[[A]][%[[T0]]]
320-
// CHECK: %[[I:.+]] = scf.if %{{.+}} -> (!ttg.async.token) {
321-
// CHECK: %[[T2:.+]] = "write_to_tmem"
322-
// CHECK: scf.yield %[[T2]]
323-
// CHECK: } else {
324-
// CHECK: scf.yield %[[T1]]
325-
// CHECK: }
326-
// CHECK: %[[L:.+]], %[[T4:.+]] = ttng.tmem_load %[[A]][%[[I]]
327-
// CHECK: tt.return %[[L]]
328-
%0 = scf.if %arg0 -> (tensor<128x128xf32, #blocked>) {
329-
%result, %token = ttng.tmem_alloc %arg1 : (tensor<128x128xf32, #blocked>) -> (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)
330-
%1 = "write_to_tmem"(%result) : (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>) -> !ttg.async.token
331-
%result_0, %token_1 = ttng.tmem_load %result[%1] : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
332-
scf.yield %result_0 : tensor<128x128xf32, #blocked>
333-
} else {
334-
scf.yield %arg1 : tensor<128x128xf32, #blocked>
335-
}
336-
tt.return %0 : tensor<128x128xf32, #blocked>
337-
}
338-
}

0 commit comments

Comments
 (0)