
Commit 7a9c004

[BACKEND] Make sure tmem load sink pattern converges (#7627)
When multiple tmem loads are sunk to the same point, each rewrite can keep re-ordering them without making progress, so the greedy driver re-applies the pattern forever and never converges. Fix this by treating a load as already sunk when the only ops between it and the dominating op are other tmem loads (a toy model of the non-convergence follows the C++ hunk below).
Parent: 991152f

2 files changed: 30 additions, 1 deletion


lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp

Lines changed: 7 additions & 1 deletion
@@ -141,7 +141,13 @@ class SinkTMEMLoad : public OpRewritePattern<ttng::TMEMLoadOp> {
           return postDomInfo.properlyPostDominates(use->getOwner(), domOp);
         }))
       return failure();
-    if (domOp == load->getNextNode()) {
+    // To avoid re-ordering multiple tmem loads in a loop, don't sink if all
+    // the ops between the load and the domOp are tmem loads.
+    Operation *nextNode = load->getNextNode();
+    while (auto tmemLoad = dyn_cast<ttng::TMEMLoadOp>(nextNode)) {
+      nextNode = tmemLoad->getNextNode();
+    }
+    if (domOp == nextNode) {
       // The load wasn't moved.
       return failure();
     }
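For intuition, here is a minimal, self-contained C++ toy model of the convergence issue. It is not Triton's real rewrite-driver API: sinkLoad, runDriver, and the string-based op list are hypothetical stand-ins that only mimic the shape of the pattern. With the old check, two loads that both sink to the same point swap places on every pass, so every pass reports a change; skipping over the run of neighboring tmem loads (as the fix above does) makes the pattern report failure, and the driver can stop.

#include <iostream>
#include <string>
#include <vector>

using Block = std::vector<std::string>; // a basic block as a flat op list

bool isTmemLoad(const std::string &op) { return op.rfind("tmem_load", 0) == 0; }

// Sink the load at loadIdx to just before the op at domIdx. Old check: the
// load "didn't move" only when it sits immediately before domOp. New check:
// a run of tmem loads between the load and domOp also counts as "already
// sunk", mirroring the while loop in the real pattern above.
bool sinkLoad(Block &block, size_t loadIdx, size_t domIdx, bool skipLoadRun) {
  size_t next = loadIdx + 1;
  if (skipLoadRun)
    while (next < domIdx && isTmemLoad(block[next]))
      ++next;
  if (next == domIdx)
    return false; // report failure so the greedy driver can converge
  std::string op = block[loadIdx];
  block.erase(block.begin() + loadIdx);
  block.insert(block.begin() + (domIdx - 1), op);
  return true;
}

// Run passes until one makes no change; cap the count so the buggy variant
// cannot actually spin forever.
int runDriver(Block block, bool skipLoadRun, int maxIters = 8) {
  for (int iter = 1; iter <= maxIters; ++iter) {
    bool changed = false;
    size_t domIdx = block.size() - 1; // "use" post-dominates both loads
    for (size_t i = 0; i + 1 < block.size(); ++i)
      if (isTmemLoad(block[i]))
        changed |= sinkLoad(block, i, domIdx, skipLoadRun);
    if (!changed)
      return iter;
  }
  return maxIters; // hit the cap: no convergence
}

int main() {
  Block block = {"tmem_load_A", "tmem_load_B", "use"};
  std::cout << "old check: stopped after " << runDriver(block, false)
            << " passes (hit the cap, i.e. no convergence)\n";
  std::cout << "new check: converged after " << runDriver(block, true)
            << " pass(es)\n";
}

With the old check the two loads ping-pong ({A, B, use} -> {B, A, use} -> {A, B, use} -> ...) and the driver only stops because of the artificial cap; with the new check the first pass already reports no change, which is exactly the convergence the regression test below locks in.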

test/TritonGPU/hoist-tmem-alloc.mlir

Lines changed: 23 additions & 0 deletions
@@ -353,3 +353,26 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return %result1, %token1 : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: @sink_multiple_tmem_load
+  tt.func public @sink_multiple_tmem_load(%m: !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, %t: !ttg.async.token) -> (tensor<128x128xf32, #blocked>, tensor<128x128xf32, #blocked>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c2_i32 = arith.constant 2 : i32
+    %res:2 = scf.for %i = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%init0 = %cst, %init1 = %cst) -> (tensor<128x128xf32, #blocked>, tensor<128x128xf32, #blocked>) : i32 {
+      // Any order is fine, just make sure we don't reorder them in an infinite loop.
+      // CHECK-COUNT-2: ttng.tmem_load
+      // CHECK: scf.yield
+      %l0, %token_1 = ttng.tmem_load %m[%t] : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
+      %l1, %token_2 = ttng.tmem_load %m[%t] : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
+      scf.yield %l0, %l1 : tensor<128x128xf32, #blocked>, tensor<128x128xf32, #blocked>
+    } {tt.scheduled_max_stage = 3 : i32}
+    tt.return %res#0, %res#1 : tensor<128x128xf32, #blocked>, tensor<128x128xf32, #blocked>
+  }
+}
