[RemoveLayoutConversion]: Destroy 'ttg.convert_layout' operations unless they have a user (#4880)

etiotto · web-flow · commit 4199e4e7c9b0 · 2025-08-13T11:24:58.000-04:00
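The fix prevents the RemoveLayoutConversions pass from incorrectly erasing `ttg.convert_layout` operations that still have users: `LayoutPropagation::rewriteRegion` now erases an operation queued in `opToDelete` only when it has no remaining users. The change also adds an `LLVM_DEBUG` dump of the anchor layouts at the end of `LayoutPropagation::initAnchorLayout` and simplifies how `LayoutPropagation::propagateToUsers` builds the list of values passed to `setEncoding` for store operations. Fixes issue #4866. Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>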
diff --git a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir
@@ -286,3 +286,47 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+
+// COM: Fix for issue #4866
+
+// CHECK: #[[BLOCKED:.+]] = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
+// CHECK: #[[BLOCKED1:.+]] = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked3 = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32, ttig.support_dpas, ttig.support_sg_2d_block} {
+  tt.func public @test_4866(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: i64) {
+    %c1_i32 = arith.constant 1 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #blocked>
+    %cst_0 = arith.constant dense<5.000000e-01> : tensor<16x32xf32, #blocked1>
+    %c64_i64 = arith.constant 64 : i64
+    %c32_i32 = arith.constant 32 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i64 = arith.constant 1 : i64
+    %c16_i32 = arith.constant 16 : i32
+    %0 = tt.make_tensor_ptr %arg0, [%arg2, %c64_i64], [%c64_i64, %c1_i64], [%c0_i32, %c32_i32] {order = array<i32: 1, 0>} : <tensor<16x32xf16, #blocked2>>
+    %1 = tt.make_tensor_ptr %arg1, [%arg2, %c64_i64], [%c64_i64, %c1_i64], [%c0_i32, %c32_i32] {order = array<i32: 1, 0>} : <tensor<16x32xf32, #blocked2>>
+    %2:2 = scf.for %arg3 = %c0_i32 to %c16_i32 step %c1_i32 iter_args(%arg4 = %0, %arg5 = %1) -> (!tt.ptr<tensor<16x32xf16, #blocked2>>, !tt.ptr<tensor<16x32xf32, #blocked2>>)  : i32 {
+      // CHECK: scf.for {{.*}}
+      // CHECK: [[LOAD_RES:%.*]] = tt.load {{.*}} : !tt.ptr<tensor<16x32xf16, #[[BLOCKED1]]>>
+      // CHECK: [[CONV1:%.*]] = ttg.convert_layout [[LOAD_RES]] : tensor<16x32xf16, #[[BLOCKED1]]> -> tensor<16x32xf16, #ttg.dot_op<{opIdx = 1, parent = #[[BLOCKED]]}>>
+      // CHECK: [[DOT_RES:%.*]] = tt.dot %cst_0, [[CONV1]], %cst : tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #[[BLOCKED]]}>> * tensor<16x32xf16, #ttg.dot_op<{opIdx = 1, parent = #[[BLOCKED]]}>> -> tensor<16x32xf32, #[[BLOCKED]]>
+      // CHECK: [[CONV2:%.*]] = ttg.convert_layout [[DOT_RES]] : tensor<16x32xf32, #[[BLOCKED]]> -> tensor<16x32xf32, #[[BLOCKED1]]>
+      // CHECK: tt.store {{.*}}, [[CONV2]] : !tt.ptr<tensor<16x32xf32, #[[BLOCKED1]]>>
+      %3 = tt.load %arg4 : !tt.ptr<tensor<16x32xf16, #blocked2>>
+      %4 = ttg.convert_layout %3 : tensor<16x32xf16, #blocked2> -> tensor<16x32xf16, #blocked1>
+      %5 = ttg.convert_layout %cst : tensor<16x16xf16, #blocked> -> tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked3}>>
+      %6 = ttg.convert_layout %4 : tensor<16x32xf16, #blocked1> -> tensor<16x32xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked3}>>
+      %7 = ttg.convert_layout %cst_0 : tensor<16x32xf32, #blocked1> -> tensor<16x32xf32, #blocked3>
+      %8 = tt.dot %5, %6, %7 : tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked3}>> * tensor<16x32xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked3}>> -> tensor<16x32xf32, #blocked3>
+      %9 = ttg.convert_layout %8 : tensor<16x32xf32, #blocked3> -> tensor<16x32xf32, #blocked1>
+      %10 = ttg.convert_layout %9 : tensor<16x32xf32, #blocked1> -> tensor<16x32xf32, #blocked2>
+      tt.store %arg5, %10 : !tt.ptr<tensor<16x32xf32, #blocked2>>
+      scf.yield %arg4, %arg5 : !tt.ptr<tensor<16x32xf16, #blocked2>>, !tt.ptr<tensor<16x32xf32, #blocked2>>
+    }
+    tt.return
+  }
+}
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
@@ -240,6 +240,16 @@ void LayoutPropagation::initAnchorLayout() {
       }
     }
   });
+
+  LLVM_DEBUG({
+    DBGS() << "Anchors: \n";
+    for (auto [v, info] : layouts) {
+      DBGS().indent(2) << "Value:  " << v << "\n";
+      DBGS().indent(2) << "Encodings (" << info.encodings.size() << "):\n";
+      for (Attribute encoding : info.encodings)
+        DBGS().indent(4) << encoding << "\n";
+    }
+  });
 }
 
 void LayoutPropagation::setEncoding(ValueRange values, LayoutInfo &info,
@@ -337,12 +347,10 @@ SmallVector<Value> LayoutPropagation::propagateToUsers(Value value,
         return isMMAorMMADerived;
       };
       if (llvm::all_of(info.encodings, checkMMAorMMADerived)) {
+        SmallVector<Value> valuesToChange{storeOp.getPtr(), storeOp.getValue()};
         if (storeOp.getMask())
-          setEncoding({storeOp.getPtr(), storeOp.getValue(), storeOp.getMask()},
-                      info, changed, user);
-        else
-          setEncoding({storeOp.getPtr(), storeOp.getValue()}, info, changed,
-                      user);
+          valuesToChange.emplace_back(storeOp.getMask());
+        setEncoding(valuesToChange, info, changed, user);
       }
       continue;
     }
@@ -481,8 +489,11 @@ void LayoutPropagation::rewriteRegion(Region &region) {
       }
     }
   }
-  for (Operation *op : llvm::reverse(opToDelete))
-    op->erase();
+
+  for (Operation *op : llvm::reverse(opToDelete)) {
+    if (op->getUsers().empty())
+      op->erase();
+  }
 }
 
 void LayoutPropagation::map(Value old, Value newV) {

Original file line number	Diff line number	Diff line change
`@@ -240,6 +240,16 @@ void LayoutPropagation::initAnchorLayout() {`
`240`	`240`	`}`
`241`	`241`	`}`
`242`	`242`	`});`
	`243`	`+`
	`244`	`+ LLVM_DEBUG({`
	`245`	`+ DBGS() << "Anchors: \n";`
	`246`	`+ for (auto [v, info] : layouts) {`
	`247`	`+ DBGS().indent(2) << "Value: " << v << "\n";`
	`248`	`+ DBGS().indent(2) << "Encodings (" << info.encodings.size() << "):\n";`
	`249`	`+ for (Attribute encoding : info.encodings)`
	`250`	`+ DBGS().indent(4) << encoding << "\n";`
	`251`	`+ }`
	`252`	`+ });`
`243`	`253`	`}`
`244`	`254`
`245`	`255`	`void LayoutPropagation::setEncoding(ValueRange values, LayoutInfo &info,`
`@@ -337,12 +347,10 @@ SmallVector<Value> LayoutPropagation::propagateToUsers(Value value,`
`337`	`347`	`return isMMAorMMADerived;`
`338`	`348`	`};`
`339`	`349`	`if (llvm::all_of(info.encodings, checkMMAorMMADerived)) {`
	`350`	`+ SmallVector<Value> valuesToChange{storeOp.getPtr(), storeOp.getValue()};`
`340`	`351`	`if (storeOp.getMask())`
`341`		`- setEncoding({storeOp.getPtr(), storeOp.getValue(), storeOp.getMask()},`
`342`		`- info, changed, user);`
`343`		`- else`
`344`		`- setEncoding({storeOp.getPtr(), storeOp.getValue()}, info, changed,`
`345`		`- user);`
	`352`	`+ valuesToChange.emplace_back(storeOp.getMask());`
	`353`	`+ setEncoding(valuesToChange, info, changed, user);`
`346`	`354`	`}`
`347`	`355`	`continue;`
`348`	`356`	`}`
`@@ -481,8 +489,11 @@ void LayoutPropagation::rewriteRegion(Region &region) {`
`481`	`489`	`}`
`482`	`490`	`}`
`483`	`491`	`}`
`484`		`- for (Operation *op : llvm::reverse(opToDelete))`
`485`		`- op->erase();`
	`492`	`+`
	`493`	`+ for (Operation *op : llvm::reverse(opToDelete)) {`
	`494`	`+ if (op->getUsers().empty())`
	`495`	`+ op->erase();`
	`496`	`+ }`
`486`	`497`	`}`
`487`	`498`
`488`	`499`	`void LayoutPropagation::map(Value old, Value newV) {`