
Commit 7e20e48

etiotto and Copilot authored

[Coalescing]: Support layout propagation through scf.if nested in a loop (#4868)

This PR fixes layout propagation through scf.if operations nested within loops in the Triton Intel GPU coalescing pass.

Fixes #4867

Signed-off-by: Tiotto, Ettore <[email protected]>
Co-authored-by: Copilot <[email protected]>

1 parent 4f8134b
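
Note on the fix, ahead of the diffs below: the pass previously updated only loop results (scf.for/scf.while) when it saw an scf.yield, so a yield feeding an scf.if result inside a loop was missed. A condensed sketch of the dispatch pattern the patch adopts follows; it compiles against MLIR's SCF and interface headers, but `handleYield` and `propagateToResult` are illustrative stand-ins, not the pass's real entry points.

// Sketch only: condenses the dispatch pattern adopted by this patch.
// `handleYield` and `propagateToResult` are illustrative stand-ins for the
// pass's internals, not its real entry points.
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/ErrorHandling.h"

using namespace mlir;

// Stand-in for the pass's propagateLayoutToOperationResult: the real code
// rewrites the type of result `resNum` of `op` to carry the new layout.
static void propagateToResult(Operation *op, unsigned resNum) {
  (void)op;
  (void)resNum;
}

// An scf.yield forwards its operands to the results of whatever op owns the
// region: scf.for or scf.while (both LoopLikeOpInterface), or scf.if. A
// layout change on a yielded value must therefore be mirrored on the
// corresponding parent-op result, whichever of those parents it is.
static void handleYield(scf::YieldOp yieldOp, Value val) {
  Operation *parentOp = yieldOp->getParentOp();
  for (OpOperand &operand : yieldOp->getOpOperands()) {
    if (operand.get() != val)
      continue;
    llvm::TypeSwitch<Operation *>(parentOp)
        .Case<LoopLikeOpInterface, scf::IfOp>([&](auto op) {
          propagateToResult(op, operand.getOperandNumber());
        })
        .Default([](Operation *op) {
          llvm::report_fatal_error("unsupported scf.yield parent");
        });
  }
}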

File tree

2 files changed: +69 −26 lines

test/TritonIntelGPU/coalesce.mlir
Lines changed: 41 additions & 5 deletions

@@ -54,8 +54,7 @@ tt.func @transpose(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32},
 // -----
 
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
-
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
 // CHECK: [[NARROW_LAYOUT:#.*]] = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
 // CHECK: [[WIDE_LAYOUT:#.*]] = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
@@ -343,7 +342,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 4], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [1, 1, 32], warpsPerCTA = [1, 4, 4], order = [2, 1, 0]}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, "ttg.threads-per-warp" = 32 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32} {
 // CHECK-DAG: [[BLOCKED_LAYOUT:#.*]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 4], order = [1, 0]}>
 // CHECK-DAG: [[BLOCKED_LAYOUT1:#.*]] = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [1, 1, 32], warpsPerCTA = [1, 4, 4], order = [2, 1, 0]}>
 // CHECK: @triton_red_fused_mul_sum_0
@@ -412,7 +411,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, "ttg.th
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 16], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [8, 4], order = [1, 0]}>
 #blocked2 = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 16], warpsPerCTA = [8, 4], order = [1, 0]}>
-module attributes {ttig.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
 // CHECK-DAG: [[BLOCKED_LAYOUT:#.*]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 16], order = [1, 0]}>
 // CHECK-DAG: [[BLOCKED_LAYOUT1:#.*]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [8, 4], order = [1, 0]}>
 // CHECK-DAG: [[BLOCKED_LAYOUT2:#.*]] = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 16], warpsPerCTA = [8, 4], order = [1, 0]}>
@@ -474,7 +473,7 @@ module attributes {ttig.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.n
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [2, 1], order = [1, 0]}>
 #blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [8, 1, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}>
 #blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [1, 8, 4], warpsPerCTA = [1, 2, 1], order = [0, 1, 2]}>
-module attributes {ttig.min_sg_size = 16 : i32, ttig.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32} {
 // CHECK-DAG: [[BLOCKED_LAYOUT:#.*]] = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [2, 4, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}>
 // CHECK-DAG: [[BLOCKED_LAYOUT1:#.*]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [2, 1], order = [1, 0]}>
 // CHECK-DAG: [[BLOCKED_LAYOUT2:#.*]] = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [8, 1, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}>
@@ -587,3 +586,40 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     tt.return
   }
 }
+
+// -----
+
+// COM: Test layout propagation for nested operations (scf.if nested in scf.for).
+// COM: Reproducer for issue #4867
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  // CHECK-DAG: [[BLOCKED_LAYOUT:#.*]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+  // CHECK-DAG: [[BLOCKED_LAYOUT1:#.*]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
+  // CHECK: @test_4867
+  tt.func public @test_4867(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: i1) {
+    %c0_i32 = arith.constant 0 : i32
+    %c16_i32 = arith.constant 16 : i32
+    %c128_i64 = arith.constant 128 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c32_i32 = arith.constant 32 : i32
+    %0 = tt.make_tensor_ptr %arg0, [%c128_i64, %c128_i64], [%c1_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<128x32xf32, #blocked>>
+    %1 = tt.make_tensor_ptr %arg1, [%c128_i64, %c128_i64], [%c1_i64, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x128xf32, #blocked>>
+    %2:2 = scf.for %arg3 = %c0_i32 to %c32_i32 step %c32_i32 iter_args(%arg4 = %0, %arg5 = %1) -> (!tt.ptr<tensor<128x32xf32, #blocked>>, !tt.ptr<tensor<32x128xf32, #blocked>>) : i32 {
+      // CHECK: scf.for {{.*}}
+      // CHECK-NOT: [[BLOCKED_LAYOUT]]>>
+      %adv = tt.advance %arg5, [%c32_i32, %c0_i32] : <tensor<32x128xf32, #blocked>>
+      %3:2 = scf.if %arg2 -> (!tt.ptr<tensor<32x128xf32, #blocked>>, !tt.ptr<tensor<32x128xf32, #blocked>>) {
+        scf.yield %adv, %arg5 : !tt.ptr<tensor<32x128xf32, #blocked>>, !tt.ptr<tensor<32x128xf32, #blocked>>
+      } else {
+        scf.yield %arg5, %adv : !tt.ptr<tensor<32x128xf32, #blocked>>, !tt.ptr<tensor<32x128xf32, #blocked>>
+      }
+      // CHECK: scf.yield {{.*}} : !tt.ptr<tensor<128x32xf32, [[BLOCKED_LAYOUT]]>>, !tt.ptr<tensor<32x128xf32, [[BLOCKED_LAYOUT1]]>>
+      scf.yield %arg4, %3#0 : !tt.ptr<tensor<128x32xf32, #blocked>>, !tt.ptr<tensor<32x128xf32, #blocked>>
+    }
+    // CHECK: [[ADV:%.*]] = tt.advance {{.*}} : <tensor<128x32xf32, [[BLOCKED_LAYOUT]]>>
+    %3 = tt.advance %2#0, [%c0_i32, %c16_i32] : <tensor<128x32xf32, #blocked>>
+    // CHECK: [[LOAD:%.*]] = tt.load {{.*}} : !tt.ptr<tensor<32x128xf32, [[BLOCKED_LAYOUT1]]>>
+    %4 = tt.load %1 {boundaryCheck = array<i32: 0>, padding = 1 : i32} : !tt.ptr<tensor<32x128xf32, #blocked>>
+    tt.return
+  }
+}

third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp
Lines changed: 28 additions & 21 deletions

@@ -14,6 +14,7 @@
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "triton/Tools/StrUtil.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -212,23 +213,26 @@ struct CoalescePass
       user->dumpPretty();
     });
 
-    if (auto forOp = dyn_cast<scf::ForOp>(user)) {
-      propagateLayoutToArgsAndBody(forOp, val, layout, rewriter);
-      continue;
-    }
-    if (auto whileOp = dyn_cast<scf::WhileOp>(user)) {
-      propagateLayoutToArgsAndBody(whileOp, val, layout, rewriter);
+    if (auto loopOp = dyn_cast<LoopLikeOpInterface>(user)) {
+      propagateLayoutToArgsAndBody(loopOp, val, layout, rewriter);
       continue;
     }
     if (auto yieldOp = dyn_cast<scf::YieldOp>(user)) {
-      if (auto loopOp = yieldOp->getParentOfType<LoopLikeOpInterface>()) {
-        for (OpOperand &operand : llvm::make_filter_range(
-                 yieldOp->getOpOperands(),
-                 [&val](OpOperand &operand) { return operand.get() == val; }))
-          propagateLayoutToLoopResult(loopOp, operand.getOperandNumber(),
-                                      layout, rewriter);
-        continue;
-      }
+      Operation *parentOp = yieldOp->getParentOp();
+      for (OpOperand &operand : llvm::make_filter_range(
+               yieldOp->getOpOperands(),
+               [&val](OpOperand &operand) { return operand.get() == val; }))
+        TypeSwitch<Operation *>(parentOp)
+            .Case<LoopLikeOpInterface, scf::IfOp>([&](auto op) {
+              propagateLayoutToOperationResult(op, operand.getOperandNumber(),
+                                               layout, rewriter);
+            })
+            .Default([](auto op) {
+              llvm::report_fatal_error(llvm::Twine(
+                  "Unsupported parent operation for scf.yield: '" +
+                  op->getName().getStringRef() + "'"));
+            });
+      continue;
     }
     if (auto condOp = dyn_cast<scf::ConditionOp>(user)) {
       if (auto whileOp = condOp->getParentOfType<scf::WhileOp>()) {
@@ -295,13 +299,16 @@ struct CoalescePass
     }
   }
 
-  // Modify the \p layout to the loop's operand identified by \p resNum, and
-  // propagate the modified loop results to its users.
-  void propagateLayoutToLoopResult(LoopLikeOpInterface loopOp, unsigned resNum,
-                                   Attribute layout,
-                                   IRRewriter &rewriter) const {
-    Value loopRes = loopOp->getResult(resNum);
-    rewriter.modifyOpInPlace(loopOp, [&]() {
+  // Modify the \p layout of the operation \p op result identified by \p resNum,
+  // and propagate the modified operation result to its users.
+  template <typename OpType,
+            typename = std::enable_if_t<
+                llvm::is_one_of<OpType, LoopLikeOpInterface, scf::IfOp>::value>>
+  void propagateLayoutToOperationResult(OpType op, unsigned resNum,
+                                        Attribute layout,
+                                        IRRewriter &rewriter) const {
+    Value loopRes = op->getResult(resNum);
+    rewriter.modifyOpInPlace(op, [&]() {
      assert(tt::isTensorPointerType(loopRes.getType()) &&
             "Expecting blocked pointers");
      Type resType = loopRes.getType();
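
A note on the template constraint in the last hunk: `std::enable_if_t` over `llvm::is_one_of` is plain SFINAE, restricting `propagateLayoutToOperationResult` to exactly the two types the TypeSwitch can hand it, so a call with any other op type fails at compile time rather than at runtime. A minimal, self-contained sketch of the same idiom; `ForLike`, `IfLike`, and `propagateResult` are hypothetical stand-ins, not MLIR or pass names.

// Minimal sketch of the SFINAE constraint used above; ForLike and IfLike
// are hypothetical stand-ins for LoopLikeOpInterface and scf::IfOp.
#include "llvm/ADT/STLExtras.h" // llvm::is_one_of
#include <type_traits>

struct ForLike {};
struct IfLike {};

// Instantiable only when OpType is one of the listed types; any other
// argument type is rejected by overload resolution at compile time.
template <typename OpType,
          typename = std::enable_if_t<
              llvm::is_one_of<OpType, ForLike, IfLike>::value>>
void propagateResult(OpType /*op*/) {
  // The real function rewrites the type of one op result here.
}

int main() {
  propagateResult(ForLike{}); // OK
  propagateResult(IfLike{});  // OK
  // propagateResult(42);     // error: no matching function
}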
