From 8c3ad61ae305a54735f4a5f61d0358c9954859c9 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038@qq.com>
Date: Mon, 13 Oct 2025 13:22:36 +0000
Subject: [PATCH 1/3] use -loop-invariant-subset-hoisting in transform fuse.

---
 .../SCF/Transforms/TileUsingInterface.cpp       | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index 29b770fb4b279..32b2560484a41 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -25,6 +25,7 @@
 #include "mlir/Interfaces/TilingInterface.h"
 #include "mlir/Rewrite/FrozenRewritePatternSet.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
@@ -1316,7 +1317,15 @@ getUntiledProducerFromSliceSource(OpOperand *source,
                                   ArrayRef<LoopLikeOpInterface> loops) {
   std::optional<OpOperand *> destinationIterArg;
   assert(!loops.empty() && "expected non empty loops container");
+
+  // The `extractOp` may not reside within the innermost loop, calculate the
+  // distance between it and the last LoopLikeInterfaceOp. Adding this
+  // `distance` to `loopIt` yields the start of the loop.
   auto loopIt = loops.rbegin();
+  auto parentLoop = source->getOwner()->getParentOfType<LoopLikeOpInterface>();
+  const LoopLikeOpInterface *it = llvm::find(loops, parentLoop);
+  int64_t distance = std::distance(loops.begin(), it);
+  loopIt += (loops.size() - distance - 1);
   while (loopIt != loops.rend() && isa<BlockArgument>(source->get())) {
     auto iterArg = cast<BlockArgument>(source->get());
     auto loop = *loopIt;
@@ -1347,7 +1356,6 @@ mlir::scf::tileAndFuseProducerOfSlice(
 
   OpBuilder::InsertionGuard g(rewriter);
   rewriter.setInsertionPoint(candidateSliceOp);
-
   // 2. Clone the fused producer
   // 2a. Compute the destination operands to use for the cloned operation.
   SmallVector<Value> origDestinationTensors, clonedOpDestinationTensors;
@@ -1750,6 +1758,13 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF(
                                      replacements};
   }
 
+  // The extract_slice op is created in the innermost loop by default. Using
+  // hoistLoopInvariantSubsets improves the position of the extract_slice op
+  // within the loops, allowing the fuse Op to be created in the correct loop.
+  for (LoopLikeOpInterface loop : loops) {
+    (void)hoistLoopInvariantSubsets(rewriter, loop);
+  }
+
   // Since the loop gets potentially replaced during fusion, we need to track
   // the mutation of replacement values. To do this, we attach a listener to
   // update the replacements as they happen.

From e2d9db38194229068e2f39cd552072833cef15e0 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038@qq.com>
Date: Mon, 13 Oct 2025 16:36:42 +0000
Subject: [PATCH 2/3] add pooling_ncw_max_fill_fuse example.

---
 .../SCF/Transforms/TileUsingInterface.cpp     | 10 ++--
 .../tile-and-fuse-using-interface.mlir        | 47 +++++++++++++++++--
 .../tile-fuse-and-yield-using-interface.mlir  |  2 +-
 .../tile-fuse-and-yield-using-scfforall.mlir  |  2 +-
 4 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index 32b2560484a41..4684ad5dd84ae 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -1759,11 +1759,13 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF(
   }
 
   // The extract_slice op is created in the innermost loop by default. Using
-  // hoistLoopInvariantSubsets improves the position of the extract_slice op
-  // within the loops, allowing the fuse Op to be created in the correct loop.
-  for (LoopLikeOpInterface loop : loops) {
+  // `moveLoopInvariantCode` and `hoistLoopInvariantSubsets` improves the
+  // position of the extract_slice op within the loops, allowing the fuse Op to
+  // be created in the correct loop.
+  for (LoopLikeOpInterface loop : loops)
+    (void)moveLoopInvariantCode(loop);
+  for (LoopLikeOpInterface loop : loops)
     (void)hoistLoopInvariantSubsets(rewriter, loop);
-  }
 
   // Since the loop gets potentially replaced during fusion, we need to track
   // the mutation of replacement values. To do this, we attach a listener to
diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
index 8116044594fca..1f2d08cf2959e 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
@@ -28,9 +28,9 @@ module attributes {transform.with_named_sequence} {
 //      CHECK:   %[[INIT:.+]] = tensor.empty
 //      CHECK:   scf.for %[[IV0:[a-zA-Z0-9]+]] =
 // CHECK-SAME:       iter_args(%[[ITERARG0:.+]] = %[[INIT]])
+//  CHECK-DAG:     %[[LHS_TILE:.+]] = tensor.extract_slice %[[ARG0]][%[[IV0]], 0]
 //      CHECK:     scf.for %[[IV1:[a-zA-Z0-9]+]] =
 // CHECK-SAME:         iter_args(%[[ITERARG1:.+]] = %[[ITERARG0]])
-//  CHECK-DAG:       %[[LHS_TILE:.+]] = tensor.extract_slice %[[ARG0]][%[[IV0]], 0]
 //  CHECK-DAG:       %[[RHS_TILE:.+]] = tensor.extract_slice %[[ARG1]][0, %[[IV1]]]
 //  CHECK-DAG:       %[[INIT_TILE:.+]] = tensor.extract_slice %[[ITERARG1]][%[[IV0]], %[[IV1]]]
 //      CHECK:       %[[FILL_TILE:.+]] = linalg.fill
@@ -141,6 +141,7 @@ module attributes {transform.with_named_sequence} {
 //  CHECK-DAG:   %[[INIT0:.+]] = tensor.empty(%[[D0]], %[[D1]])
 //  CHECK-DAG:   %[[D2:.+]] = tensor.dim %[[RHS1]], %[[C1]]
 //      CHECK:   %[[INIT1:.+]] = tensor.empty(%[[D0]], %[[D2]])
+//  CHECK-DAG:   %[[RHS1_TILE:.+]] = tensor.extract_slice %[[RHS1]][0, 0]
 //      CHECK:   scf.for %[[IV:[a-zA-Z0-9]+]] =
 // CHECK-SAME:       iter_args(%[[ITERARG:.+]] = %[[INIT1]])
 //  CHECK-DAG:     %[[LHS0_TILE:.+]] = tensor.extract_slice %[[LHS0]][%[[IV]], 0]
@@ -151,7 +152,6 @@ module attributes {transform.with_named_sequence} {
 //      CHECK:     %[[GEMM0_TILE:.+]] = linalg.matmul
 // CHECK-SAME:         ins(%[[LHS0_TILE]], %[[RHS0_TILE]] :
 // CHECK-SAME:         outs(%[[FILL0_TILE]] :
-//  CHECK-DAG:     %[[RHS1_TILE:.+]] = tensor.extract_slice %[[RHS1]][0, 0]
 //  CHECK-DAG:     %[[INIT1_TILE:.+]] = tensor.extract_slice %[[ITERARG]][%[[IV]], 0]
 //      CHECK:     %[[FILL1_TILE:.+]] = linalg.fill
 // CHECK-SAME:         outs(%[[INIT1_TILE]] :
@@ -444,6 +444,7 @@ module attributes {transform.with_named_sequence} {
 //   CHECK-DAG:   %[[M:.+]] = tensor.dim %[[ORIG_GEMM2]], %[[C0]]
 //   CHECK-DAG:   %[[N2:.+]] = tensor.dim %[[ORIG_GEMM2]], %[[C1]]
 //   CHECK-DAG:   %[[N3:.+]] = tensor.dim %[[ARG5]], %[[C1]]
+//   CHECK-DAG:   %[[SLICE_ARG5:.+]] = tensor.extract_slice %[[ARG5]][0, 0] [%[[N2]], %[[N3]]]
 //       CHECK:   %[[R0:.+]] = scf.for %[[IV:[a-zA-Z0-9_]+]] =
 //  CHECK-SAME:       iter_args(%[[ARG8:.+]] = %[[ARG6]]) -> (tensor<?x?xf32>) {
 //   CHECK-DAG:     %[[N1:.+]] = tensor.dim %[[ORIG_GEMM1]], %[[C1]]
@@ -458,7 +459,6 @@ module attributes {transform.with_named_sequence} {
 //   CHECK-DAG:     %[[SLICE_ARG4:.+]] = tensor.extract_slice %[[ARG4]][%[[IV]], 0] [%[[TILE_M]], %[[N2]]]
 //   CHECK-DAG:     %[[TILE_GEMM2:.+]] = linalg.matmul ins(%[[TILE_GEMM1]], %[[SLICE_ARG3]] :
 //  CHECK-SAME:         outs(%[[SLICE_ARG4]] :
-//   CHECK-DAG:     %[[SLICE_ARG5:.+]] = tensor.extract_slice %[[ARG5]][0, 0] [%[[N2]], %[[N3]]]
 //   CHECK-DAG:     %[[SLICE_ARG6:.+]] = tensor.extract_slice %[[ARG8]][%[[IV]], 0] [%[[TILE_M]], %[[N3]]]
 //   CHECK-DAG:     %[[TILE_GEMM3:.+]] = linalg.matmul
 //  CHECK-SAME:         ins(%[[TILE_GEMM2]], %[[SLICE_ARG5]] :
@@ -688,3 +688,44 @@ module attributes {transform.with_named_sequence} {
 // CHECK:           }
 // CHECK:         }
 
+// -----
+
+func.func @pooling_ncw_max_fill_fuse(%input: tensor<?x?x?xf32>, %fake: tensor<?xf32>, %init: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+  %res = linalg.pooling_ncw_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: tensor<?x?x?xf32>, tensor<?xf32>)
+    outs(%fill: tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+  return %res : tensor<?x?x?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+       %arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.pooling_ncw_max"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %tiled_pool, %loops0:4 = transform.structured.fuse %0 {tile_sizes = [1, 16, 1, 1], apply_cleanup = true}
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+
+// CHECK-LABEL: func.func @pooling_ncw_max_fill_fuse(
+//  CHECK-SAME:   %[[INPUT:.*]]: tensor<?x?x?xf32>,
+//  CHECK-SAME:   %[[FAKE:.*]]: tensor<?xf32>,
+//  CHECK-SAME:   %[[INIT:.*]]: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
+//       CHECK:   %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+//       CHECK:   scf.for %[[IV0:[a-zA-Z0-9]+]] =
+//  CHECK-SAME:     iter_args(%[[ITERARG0:.+]] = %[[INIT]])
+//       CHECK:   scf.for %[[IV1:[a-zA-Z0-9]+]] =
+//  CHECK-SAME:     iter_args(%[[ITERARG1:.+]] = %[[ITERARG0]])
+//       CHECK:   scf.for %[[IV2:[a-zA-Z0-9]+]] =
+//  CHECK-SAME:     iter_args(%[[ITERARG2:.+]] = %[[ITERARG1]])
+//       CHECK:     %[[FILL_EXTRACT:.*]] = tensor.extract_slice %[[ITERARG2]]{{\[}}%[[IV0]], %[[IV1]], %[[IV2]]]
+//       CHECK:     %[[TILED_FILL:.*]] = linalg.fill ins(%[[ZERO]] : f32) outs(%[[FILL_EXTRACT]] : tensor<1x?x1xf32>) -> tensor<1x?x1xf32>
+//       CHECK:   scf.for %[[IV3:[a-zA-Z0-9]+]] =
+//  CHECK-SAME:     iter_args(%[[ITERARG3:.*]] = %[[ITERARG2]], %[[ITERARG4:.*]] = %[[TILED_FILL]])
+//       CHECK:     %[[TILED_INPUT:.*]] = tensor.extract_slice %[[INPUT]]{{\[}}%[[IV0]], %[[IV1]]
+//       CHECK:     %[[TILED_FAKE:.*]] = tensor.extract_slice %[[FAKE]]{{\[}}%[[IV3]]]
+//       CHECK:     linalg.pooling_ncw_max
+//  CHECK-SAME:       ins(%[[TILED_INPUT]], %[[TILED_FAKE]] :
+//  CHECK-SAME:       outs(%[[ITERARG4]] :
diff --git a/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-interface.mlir
index 3c0ada9d2cabc..1df1e1dcf7d58 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-interface.mlir
@@ -37,6 +37,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:     %[[INIT1:[a-zA-Z0-9]+]]: tensor<?x?xf32>)
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//  CHECK-DAG:   %[[RHS1_TILE:.+]] = tensor.extract_slice %[[RHS1]][0, 0]
 //      CHECK:   %[[RESULT:.+]]:2 = scf.for %[[IV:[a-zA-Z0-9]+]] =
 // CHECK-SAME:       iter_args(%[[ITERARG0:[a-zA-Z0-9]+]] = %[[INIT1]], %[[ITERARG1:[a-zA-Z0-9]+]] = %[[INIT0]])
 //  CHECK-DAG:     %[[LHS0_TILE:.+]] = tensor.extract_slice %[[LHS0]][%[[IV]], 0]
@@ -47,7 +48,6 @@ module attributes {transform.with_named_sequence} {
 //      CHECK:     %[[GEMM0_TILE:.+]] = linalg.matmul
 // CHECK-SAME:         ins(%[[LHS0_TILE]], %[[RHS0_TILE]] :
 // CHECK-SAME:         outs(%[[FILL0_TILE]] :
-//  CHECK-DAG:     %[[RHS1_TILE:.+]] = tensor.extract_slice %[[RHS1]][0, 0]
 //  CHECK-DAG:     %[[INIT1_TILE:.+]] = tensor.extract_slice %[[ITERARG0]][%[[IV]], 0]
 //      CHECK:     %[[FILL1_TILE:.+]] = linalg.fill
 // CHECK-SAME:         outs(%[[INIT1_TILE]] :
diff --git a/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir b/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir
index 8fc8f3245be15..f5370cd86dd9f 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir
@@ -37,6 +37,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:     %[[INIT1:[a-zA-Z0-9]+]]: tensor<?x?xf32>)
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//  CHECK-DAG:   %[[RHS1_TILE:.+]] = tensor.extract_slice %[[RHS1]][0, 0]
 //      CHECK:   %[[RESULT:.+]]:2 = scf.forall (%[[IV:[a-zA-Z0-9]+]]) =
 // CHECK-SAME:       shared_outs(%[[ITERARG0:[a-zA-Z0-9]+]] = %[[INIT1]], %[[ITERARG1:[a-zA-Z0-9]+]] = %[[INIT0]])
 //  CHECK-DAG:     %[[LHS0_TILE:.+]] = tensor.extract_slice %[[LHS0]][%[[IV]], 0]
@@ -47,7 +48,6 @@ module attributes {transform.with_named_sequence} {
 //      CHECK:     %[[GEMM0_TILE:.+]] = linalg.matmul
 // CHECK-SAME:         ins(%[[LHS0_TILE]], %[[RHS0_TILE]] :
 // CHECK-SAME:         outs(%[[FILL0_TILE]] :
-//  CHECK-DAG:     %[[RHS1_TILE:.+]] = tensor.extract_slice %[[RHS1]][0, 0]
 //  CHECK-DAG:     %[[INIT1_TILE:.+]] = tensor.extract_slice %[[ITERARG0]][%[[IV]], 0]
 //      CHECK:     %[[FILL1_TILE:.+]] = linalg.fill
 // CHECK-SAME:         outs(%[[INIT1_TILE]] :

From 414a12302a686fe25264a8f6ba08f822bc787378 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038@qq.com>
Date: Mon, 13 Oct 2025 16:52:41 +0000
Subject: [PATCH 3/3] fix test

---
 .../TilingInterface/tile-and-fuse-using-interface.mlir          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
index 1f2d08cf2959e..70de5e73a2773 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
@@ -703,7 +703,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(
        %arg0: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.pooling_ncw_max"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    %tiled_pool, %loops0:4 = transform.structured.fuse %0 {tile_sizes = [1, 16, 1, 1], apply_cleanup = true}
+    %a, %b, %c, %d, %e = transform.structured.fuse %0 tile_sizes [1, 16, 1, 1]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }