[Codegen] Canonicalize loops and subviews after copy vectorization (#22344)

jtuyls · web-flow · commit 9e7183e961e1 · 2025-10-21T19:44:56.000+02:00
The new `memref.copy` tiling and vectorization patterns can generate
loops and subviews that can be canonicalized further. If this isn't
done, it can lead to errors in a later `FoldMemRefAliasOp` pass as the
subviews are supposed to be canonicalized at that point.

Signed-off-by: Jorn Tuyls <jorn.tuyls@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/VectorizeMemrefCopy.cpp b/compiler/src/iree/compiler/Codegen/Common/VectorizeMemrefCopy.cpp
@@ -5,9 +5,11 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/Passes.h"
+#include "iree/compiler/Codegen/Transforms/Transforms.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -60,6 +62,11 @@ struct TileLinalgCopy final : OpRewritePattern<memref::CopyOp> {
     for (Operation *tiledOp : tilingResult->tiledOps) {
       tiledOp->setAttr(kIsTiled, mlir::UnitAttr::get(copyOp.getContext()));
     }
+    // Put a marker on the loop ops, so they can be targeted for
+    // simplification.
+    for (LoopLikeOpInterface loop : llvm::reverse(tilingResult->loops)) {
+      loop->setAttr(kIsTiled, mlir::UnitAttr::get(loop.getContext()));
+    }
     if (tilingInterfaceOp->use_empty()) {
       rewriter.eraseOp(tilingInterfaceOp);
     }
@@ -104,12 +111,18 @@ struct VectorizeMemrefCopyPass final
     patterns.add<TileLinalgCopy>(&getContext());
     patterns.add<linalg::CopyVectorizationPattern>(&getContext());
     patterns.add<ConvertLinalgCopyToMemrefCopy>(&getContext());
+    // Try to remove generated single iteration loops and canonicalize generated
+    // subview operations.
+    populateRemoveSingleIterationLoopPattern(
+        patterns,
+        [&](scf::ForOp forOp) -> bool { return forOp->hasAttr(kIsTiled); });
+    memref::SubViewOp::getCanonicalizationPatterns(patterns, ctx);
     (void)applyPatternsGreedily(funcOp, std::move(patterns));
 
     // Clean up the temporary isTiled markers.
-    funcOp->walk([](memref::CopyOp copyOp) {
-      if (copyOp->hasAttr(kIsTiled)) {
-        copyOp->removeAttr(kIsTiled);
+    funcOp->walk([](Operation *op) {
+      if (op->hasAttr(kIsTiled)) {
+        op->removeAttr(kIsTiled);
       }
     });
   }
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/vectorize_memref_copy.mlir b/compiler/src/iree/compiler/Codegen/Common/test/vectorize_memref_copy.mlir
@@ -9,13 +9,9 @@ func.func @memref_copy(%source: memref<2x2xf32>, %dest: memref<2x2xf32>) {
 //  CHECK-SAME:   %[[SOURCE:[A-Za-z0-9]+]]: memref<2x2xf32>
 //  CHECK-SAME:   %[[DEST:[A-Za-z0-9]+]]: memref<2x2xf32>
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
-//       CHECK:   scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
-//       CHECK:     scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
-//       CHECK:       %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
-//       CHECK:       %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
-//       CHECK:       %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
-//       CHECK:       vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
+//   CHECK-DAG:   %[[POISON:.+]] = ub.poison : f32
+//       CHECK:   %[[RD:.+]] = vector.transfer_read %[[SOURCE]][%[[C0]], %[[C0]]], %[[POISON]] {in_bounds = [true, true]} : memref<2x2xf32>, vector<2x2xf32>
+//       CHECK:   vector.transfer_write %[[RD]], %[[DEST]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<2x2xf32>, memref<2x2xf32>
 
 // -----
 
@@ -28,13 +24,9 @@ func.func @linalg_copy(%source: memref<2x2xf32>, %dest: memref<2x2xf32>) {
 //  CHECK-SAME:   %[[SOURCE:[A-Za-z0-9]+]]: memref<2x2xf32>
 //  CHECK-SAME:   %[[DEST:[A-Za-z0-9]+]]: memref<2x2xf32>
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
-//       CHECK:   scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
-//       CHECK:     scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
-//       CHECK:       %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
-//       CHECK:       %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
-//       CHECK:       %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
-//       CHECK:       vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
+//   CHECK-DAG:   %[[POISON:.+]] = ub.poison : f32
+//       CHECK:   %[[RD:.+]] = vector.transfer_read %[[SOURCE]][%[[C0]], %[[C0]]], %[[POISON]] {in_bounds = [true, true]} : memref<2x2xf32>, vector<2x2xf32>
+//       CHECK:   vector.transfer_write %[[RD]], %[[DEST]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<2x2xf32>, memref<2x2xf32>
 
 // -----
 
@@ -44,6 +36,7 @@ func.func @memref_copy_not_multiple_of_preferred(%source: memref<2x6xf32>, %dest
   memref.copy %source, %dest : memref<2x6xf32> to memref<2x6xf32>
   return
 }
+
 // CHECK-LABEL: func.func @memref_copy_not_multiple_of_preferred
 //  CHECK-SAME:   %[[SOURCE:[A-Za-z0-9]+]]: memref<2x6xf32>
 //  CHECK-SAME:   %[[DEST:[A-Za-z0-9]+]]: memref<2x6xf32>
@@ -74,11 +67,10 @@ func.func @memref_copy_not_multiple_on_penultimate_dim(%source: memref<3x2xf32>,
 //   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 //   CHECK-DAG:   %[[C3:.+]] = arith.constant 3 : index
 //       CHECK:   scf.for %[[ARG2:.+]] = %[[C0]] to %[[C3]] step %[[C2]]
-//       CHECK:     scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
-//       CHECK:       %[[MIN:.+]] = affine.min affine_map<(d0) -> (-d0 + 3, 2)>(%[[ARG2]])
-//       CHECK:       %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [%[[MIN]], 2] [1, 1]
-//       CHECK:       %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [%[[MIN]], 2] [1, 1]
-//       CHECK:       memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
+//       CHECK:     %[[MIN:.+]] = affine.min affine_map<(d0) -> (-d0 + 3, 2)>(%[[ARG2]])
+//       CHECK:     %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], 0] [%[[MIN]], 2] [1, 1]
+//       CHECK:     %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], 0] [%[[MIN]], 2] [1, 1]
+//       CHECK:     memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
 
 // -----
 
@@ -91,14 +83,12 @@ func.func @memref_copy_dynamic(%source: memref<?x4xf32>, %dest: memref<?x4xf32>)
 //  CHECK-SAME:   %[[DEST:[A-Za-z0-9]+]]: memref<?x4xf32>
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-//   CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
 //   CHECK-DAG:   %[[DIM:.+]] = memref.dim %[[SOURCE]], %[[C0]] : memref<?x4xf32>
 //       CHECK:   scf.for %[[ARG2:.+]] = %[[C0]] to %[[DIM]] step %[[C1]]
-//       CHECK:     scf.for %[[ARG3:.+]] = %[[C0]] to %[[C4]] step %[[C4]]
-//       CHECK:       %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [1, 4] [1, 1]
-//       CHECK:       %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [1, 4] [1, 1]
-//       CHECK:       %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
-//       CHECK:       vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
+//       CHECK:     %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], 0] [1, 4] [1, 1]
+//       CHECK:     %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], 0] [1, 4] [1, 1]
+//       CHECK:     %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
+//       CHECK:     vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
 
 // -----
 
@@ -119,3 +109,68 @@ func.func @memref_copy_dynamic_inner_dim(%source: memref<4x?xf32>, %dest: memref
 //       CHECK:       %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
 //       CHECK:       %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
 //       CHECK:       memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
+
+// -----
+
+// Test that the single iteration loops are removed and the subview ops are canonicalized
+// (`memref<1x?xbf16, strided<[4, 1]>>` instead of `memref<1x?xbf16, strided<[4, 1], offset: ?>>`).
+
+func.func @memref_copy_fully_dynamic(%source: memref<1x4xbf16>, %dest: memref<32x?xbf16, strided<[40, 1], offset: ?>>, %dim: index) {
+  %c0 = arith.constant 0 : index
+  scf.forall (%arg0) in (3) {
+    %0 = affine.min affine_map<(d0) -> (d0 * -16 + 40, 16)>(%arg0)
+    %1:2 = affine.delinearize_index %dim into (2, 64) : index, index
+    %2:3 = affine.delinearize_index %1#1 into (4, 16) : index, index, index
+    %3 = affine.linearize_index disjoint [%2#1, %c0] by (4, 4) : index
+    %4 = affine.linearize_index disjoint [%1#0, %2#2] by (2, 16) : index
+    %5 = affine.max affine_map<()[s0] -> (-s0 + 32, 0)>()[%4]
+    %6 = affine.min affine_map<()[s0] -> (1, s0)>()[%5]
+    %7 = affine.max affine_map<(d0)[s0] -> (0, d0 - s0)>(%0)[%3]
+    %8 = affine.min affine_map<(d0) -> (4, d0)>(%7)
+    %subview_0 = memref.subview %source[0, 0] [%6, %8] [1, 1] : memref<1x4xbf16> to memref<?x?xbf16, strided<[4, 1]>>
+    %subview_1 = memref.subview %dest[%4, %3] [%6, %8] [1, 1] : memref<32x?xbf16, strided<[40, 1], offset: ?>> to memref<?x?xbf16, strided<[40, 1], offset: ?>>
+    memref.copy %subview_0, %subview_1 : memref<?x?xbf16, strided<[4, 1]>> to memref<?x?xbf16, strided<[40, 1], offset: ?>>
+  }
+  return
+}
+// CHECK-LABEL: func.func @memref_copy_fully_dynamic
+//  CHECK-SAME:   %[[SOURCE:[A-Za-z0-9]+]]: memref<1x4xbf16>
+//  CHECK-SAME:   %[[DEST:[A-Za-z0-9]+]]: memref<32x?xbf16, strided<[40, 1], offset: ?>>
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[LIN_0:.+]] = affine.linearize_index disjoint [%{{.+}}, %{{.+}}] by (4, 4) : index
+//   CHECK-DAG:   %[[LIN_1:.+]] = affine.linearize_index disjoint [%{{.+}}, %{{.+}}] by (2, 16) : index
+//   CHECK-DAG:   %[[MIN_0:.+]] = affine.min affine_map<()[s0] -> (1, s0)>()[%{{.+}}]
+//   CHECK-DAG:   %[[MIN_1:.+]] = affine.min affine_map<(d0) -> (4, d0)>(%{{.+}})
+//   CHECK-DAG:   %[[SUBVIEW_0:.+]] = memref.subview %[[SOURCE]][0, 0] [%[[MIN_0]], %[[MIN_1]]] [1, 1]
+//  CHECK-SAME:   memref<1x4xbf16> to memref<?x?xbf16, strided<[4, 1]>>
+//   CHECK-DAG:   %[[SUBVIEW_1:.+]] = memref.subview %[[DEST]][%[[LIN_1]], %[[LIN_0]]] [%[[MIN_0]], %[[MIN_1]]] [1, 1]
+//  CHECK-SAME:   memref<32x?xbf16, strided<[40, 1], offset: ?>> to memref<?x?xbf16, strided<[40, 1], offset: ?>>
+//   CHECK-DAG:   %[[CMP_0:.+]] = arith.cmpi sgt, %[[MIN_0]], %[[C0]] : index
+//       CHECK:   scf.if %[[CMP_0]] {
+//       CHECK:     %[[CMP_1:.+]] = arith.cmpi sgt, %[[MIN_1]], %[[C0]] : index
+//       CHECK:     scf.if %[[CMP_1]] {
+//       CHECK:       %[[MIN_2:.+]] = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 8)>(%[[C0]])[%[[MIN_1]]]
+//       CHECK:       %[[SUBIEW_2:.+]] = memref.subview %[[SUBVIEW_0]][0, 0] [1, %[[MIN_2]]] [1, 1]
+//  CHECK-SAME:       memref<?x?xbf16, strided<[4, 1]>> to memref<1x?xbf16, strided<[4, 1]>>
+//       CHECK:       %[[SUBVIEW_3:.+]] = memref.subview %[[SUBVIEW_1]][0, 0] [1, %[[MIN_2]]] [1, 1]
+//  CHECK-SAME:       memref<?x?xbf16, strided<[40, 1], offset: ?>> to memref<1x?xbf16, strided<[40, 1], offset: ?>>
+//       CHECK:       memref.copy %[[SUBIEW_2]], %[[SUBVIEW_3]]
+
+// -----
+
+// Test that scf.for operations with `_is_tiled` attribute are simplified. The `memref.copy` should still be vectorized as well.
+
+func.func @for_with_tiled_attr(%source: memref<4x?xf32>, %dest: memref<4x?xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  scf.for %arg0 = %c0 to %c1 step %c1 {
+    %subview_0 = memref.subview %source[%arg0, 0] [4, 1] [1, 1] : memref<4x?xf32> to memref<4x1xf32, strided<[?, 1], offset: ?>>
+    %subview_1 = memref.subview %dest[%arg0, 0] [4, 1] [1, 1] : memref<4x?xf32> to memref<4x1xf32, strided<[?, 1], offset: ?>>
+    memref.copy %subview_0, %subview_1 : memref<4x1xf32, strided<[?, 1], offset: ?>> to memref<4x1xf32, strided<[?, 1], offset: ?>>
+  } {_is_tiled}
+  return
+}
+// CHECK-LABEL: func.func @for_with_tiled_attr
+//   CHECK-NOT:   scf.for
+//       CHECK:   vector.transfer_read
+//       CHECK:   vector.transfer_write
diff --git a/compiler/src/iree/compiler/Codegen/Transforms/RemoveSingleIterationLoop.cpp b/compiler/src/iree/compiler/Codegen/Transforms/RemoveSingleIterationLoop.cpp
@@ -60,17 +60,22 @@ static void replaceForWithIf(PatternRewriter &rewriter, scf::ForOp op,
 namespace {
 /// Rewriting pattern that replaces single-iteration loops with their bodies.
 struct SimplifyTrivialLoops : public OpRewritePattern<scf::ForOp> {
-  using Base::Base;
+
+  SimplifyTrivialLoops(MLIRContext *context, ForControlFnRef controlFn)
+      : OpRewritePattern(context), controlFn(controlFn) {}
 
   LogicalResult matchAndRewrite(scf::ForOp op,
                                 PatternRewriter &rewriter) const override {
-    if (!(neverRunsSecondIteration(op))) {
-      return failure();
+    if (controlFn && !controlFn(op)) {
+      return rewriter.notifyMatchFailure(
+          op, "doesn't match according to the the control function");
     }
-
-    // The second iteration is never run
-    // so the loop atmost can have 1 iteration. Inline its body and remove the
-    // loop.
+    if (!neverRunsSecondIteration(op)) {
+      return rewriter.notifyMatchFailure(op,
+                                         "is not a single-iteration for loop");
+    }
+    // The second iteration is never run, so the loop can have at most 1
+    // iteration. Inline its body and remove the loop.
     SmallVector<Value> blockArgs;
     blockArgs.reserve(op.getInitArgs().size() + 1);
     blockArgs.push_back(op.getLowerBound());
@@ -82,12 +87,16 @@ struct SimplifyTrivialLoops : public OpRewritePattern<scf::ForOp> {
     }
     return success();
   }
+
+private:
+  ForControlFnRef controlFn;
 };
 
 } // namespace
 
-void populateRemoveSingleIterationLoopPattern(RewritePatternSet &patterns) {
-  patterns.add<SimplifyTrivialLoops>(patterns.getContext());
+void populateRemoveSingleIterationLoopPattern(RewritePatternSet &patterns,
+                                              ForControlFnRef controlFn) {
+  patterns.add<SimplifyTrivialLoops>(patterns.getContext(), controlFn);
 }
 
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Transforms/Transforms.h b/compiler/src/iree/compiler/Codegen/Transforms/Transforms.h
@@ -25,6 +25,8 @@
 
 namespace mlir::iree_compiler {
 
+using ForControlFnRef = llvm::function_ref<bool(scf::ForOp)>;
+
 /// Get the `offsets`, `sizes` and `strides` for a `storeOp` (or `loadOp`). This
 /// method clones the operations that generate the `Value`s used for
 /// specifying the offsets, sizesm strides and dynamic dims of the
@@ -100,7 +102,8 @@ using GetMinMaxExprFn =
 
 /// Insert pattern to remove single iteration loop. The pattern will detect
 /// single iteration loops based on the range returned ValueBoundsOpInterface.
-void populateRemoveSingleIterationLoopPattern(RewritePatternSet &patterns);
+void populateRemoveSingleIterationLoopPattern(
+    RewritePatternSet &patterns, ForControlFnRef controlFn = nullptr);
 
 // Group of Alloc operations that have overlapping liveranges.
 using AliasGroup = SmallVector<Operation *>;