Commit b2441ce

jtuylsweidel-p authored and committed
[Codegen] Tile memref.copy when vectorizing for dynamic dims (iree-org#22168)
In the case of a `memref.copy` with dynamic dimensions, we currently generate scalar code because the operation cannot be vectorized:

```
memref.copy %source, %dest : memref<?x4xf32> to memref<?x4xf32>
```

This PR adds logic to first tile the copy as a `linalg.copy`, which yields static inner copies whenever the dynamic dimension is not the innermost one. The resulting static `memref.copy` is then vectorized where possible.

Signed-off-by: Jorn Tuyls <[email protected]>
Signed-off-by: Philipp <[email protected]>
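For illustration, here is a rough sketch of the IR this produces for the example above, assembled from the new `@memref_copy_dynamic` test added in this PR (the subview types and exact vector ops are illustrative, not verbatim compiler output):

```mlir
func.func @memref_copy_dynamic(%source: memref<?x4xf32>, %dest: memref<?x4xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %pad = arith.constant 0.0 : f32
  // Tile sizes [1, 4]: step the dynamic outer dim by 1 and the static inner
  // dim by 4, so every iteration copies a static 1x4 tile.
  %dim = memref.dim %source, %c0 : memref<?x4xf32>
  scf.for %i = %c0 to %dim step %c1 {
    scf.for %j = %c0 to %c4 step %c4 {
      %src = memref.subview %source[%i, %j] [1, 4] [1, 1]
          : memref<?x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>>
      %dst = memref.subview %dest[%i, %j] [1, 4] [1, 1]
          : memref<?x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>>
      // The static tile vectorizes into one transfer_read/transfer_write pair
      // instead of scalar element-by-element copies.
      %v = vector.transfer_read %src[%c0, %c0], %pad
          : memref<1x4xf32, strided<[4, 1], offset: ?>>, vector<1x4xf32>
      vector.transfer_write %v, %dst[%c0, %c0]
          : vector<1x4xf32>, memref<1x4xf32, strided<[4, 1], offset: ?>>
    }
  }
  return
}
```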
1 parent 0231745 commit b2441ce

File tree: 7 files changed, +284 −75 lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeCopyUsingForall.cpp

Lines changed: 1 addition & 17 deletions
```diff
@@ -7,6 +7,7 @@
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
+#include "iree/compiler/Codegen/Utils/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -28,9 +29,6 @@ namespace {
 // transfer_read/transfer_write ops.
 //====---------------------------------------------------------------------===//
 
-// For optimal performance we always want to copy 128 bits
-static constexpr int kPreferredCopyNumBits = 128;
-
 // Moves the copy into a single threaded forall.
 static void distributeCopyToSingleThread(RewriterBase &rewriter,
                                          memref::CopyOp copy) {
@@ -113,20 +111,6 @@ static void distributeCopyToThreads(RewriterBase &rewriter, memref::CopyOp copy,
   rewriter.replaceOpWithNewOp<memref::CopyOp>(copy, sourceTile, targetTile);
 }
 
-static SmallVector<OpFoldResult> getCopyTileSizes(Builder &b,
-                                                  memref::CopyOp copy) {
-  int64_t rank = copy.getTarget().getType().getRank();
-  if (rank == 0) {
-    return {};
-  }
-
-  SmallVector<OpFoldResult> tileSizes(rank - 1, b.getIndexAttr(1));
-  int64_t elementBitWidth = llvm::cast<MemRefType>(copy.getTarget().getType())
-                                .getElementTypeBitWidth();
-  tileSizes.push_back(b.getIndexAttr(kPreferredCopyNumBits / elementBitWidth));
-  return tileSizes;
-}
-
 } // namespace
 
 namespace {
```

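`kPreferredCopyNumBits` and `getCopyTileSizes` are removed here; given the new `#include "iree/compiler/Codegen/Utils/Utils.h"` in both this file and `VectorizeMemrefCopy.cpp`, the tile-size helper presumably now lives in the shared codegen utilities. Per the removed code, its policy is a tile size of 1 for every dimension except the innermost, which gets 128 bits' worth of elements (4 for f32). A hedged sketch of the resulting loop nest for a `memref<2x6xf32>` copy, matching the new `@memref_copy_not_multiple_of_preferred` test below (function name hypothetical, subview types illustrative):

```mlir
func.func @copy_2x6_tiled(%source: memref<2x6xf32>, %dest: memref<2x6xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  // Tile sizes [1, 4]: 128 preferred copy bits / 32-bit elements = 4.
  scf.for %i = %c0 to %c2 step %c1 {
    scf.for %j = %c0 to %c6 step %c4 {
      // 6 is not a multiple of 4, so the tail tile is clamped to 2 elements.
      %min = affine.min affine_map<(d0) -> (-d0 + 6, 4)>(%j)
      %src = memref.subview %source[%i, %j] [1, %min] [1, 1]
          : memref<2x6xf32> to memref<1x?xf32, strided<[6, 1], offset: ?>>
      %dst = memref.subview %dest[%i, %j] [1, %min] [1, 1]
          : memref<2x6xf32> to memref<1x?xf32, strided<[6, 1], offset: ?>>
      // The dynamically sized tail tile stays a memref.copy (no masking yet).
      memref.copy %src, %dst
          : memref<1x?xf32, strided<[6, 1], offset: ?>> to memref<1x?xf32, strided<[6, 1], offset: ?>>
    }
  }
  return
}
```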
compiler/src/iree/compiler/Codegen/Common/VectorizeMemrefCopy.cpp

Lines changed: 71 additions & 4 deletions
```diff
@@ -5,46 +5,113 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/Passes.h"
+#include "iree/compiler/Codegen/Utils/Utils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
+#define DEBUG_TYPE "iree-codegen-vectorize-memref-copy"
+
+constexpr char kIsTiled[] = "_is_tiled";
+
 namespace mlir::iree_compiler {
 
 #define GEN_PASS_DEF_VECTORIZEMEMREFCOPYPASS
 #include "iree/compiler/Codegen/Common/Passes.h.inc"
 
 namespace {
 
+struct TileLinalgCopy final : OpRewritePattern<memref::CopyOp> {
+  using Base::Base;
+  LogicalResult matchAndRewrite(memref::CopyOp copyOp,
+                                PatternRewriter &rewriter) const override {
+    if (copyOp->hasAttr(kIsTiled)) {
+      return rewriter.notifyMatchFailure(copyOp, "already tiled");
+    }
+    auto linalgCopy = linalg::CopyOp::create(
+        rewriter, copyOp.getLoc(), copyOp.getSource(), copyOp.getTarget());
+    std::optional<SmallVector<int64_t>> maybeStaticTileSizes =
+        getCopyTileSizes(linalgCopy);
+    if (!maybeStaticTileSizes.has_value()) {
+      rewriter.eraseOp(linalgCopy);
+      return rewriter.notifyMatchFailure(copyOp,
+                                         "could not retrieve tile sizes");
+    }
+    SmallVector<int64_t> staticBounds = linalgCopy.getStaticLoopRanges();
+
+    auto tilingInterfaceOp = cast<TilingInterface>(linalgCopy.getOperation());
+    rewriter.setInsertionPoint(tilingInterfaceOp);
+    SmallVector<OpFoldResult> tileSizes = getAsIndexOpFoldResult(
+        rewriter.getContext(), maybeStaticTileSizes.value());
+
+    scf::SCFTilingOptions tilingOptions;
+    tilingOptions.setTileSizes(tileSizes);
+    tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForOp);
+
+    FailureOr<scf::SCFTilingResult> tilingResult =
+        scf::tileUsingSCF(rewriter, tilingInterfaceOp, tilingOptions);
+    if (failed(tilingResult)) {
+      return rewriter.notifyMatchFailure(copyOp, "tiling failed");
+    }
+    // Put a marker on the tiled ops so it's easy to recognize that they
+    // shouldn't be tiled again.
+    for (Operation *tiledOp : tilingResult->tiledOps) {
+      tiledOp->setAttr(kIsTiled, mlir::UnitAttr::get(copyOp.getContext()));
+    }
+    if (tilingInterfaceOp->use_empty()) {
+      rewriter.eraseOp(tilingInterfaceOp);
+    }
+    rewriter.eraseOp(copyOp);
+    return success();
+  }
+};
+
 struct ConvertLinalgCopyToMemrefCopy final : OpRewritePattern<linalg::CopyOp> {
   using Base::Base;
   LogicalResult matchAndRewrite(linalg::CopyOp copyOp,
                                 PatternRewriter &rewriter) const override {
     if (copyOp.hasPureTensorSemantics()) {
       return failure();
     }
-    memref::CopyOp::create(rewriter, copyOp.getLoc(),
-                           copyOp.getDpsInputOperand(0)->get(),
-                           copyOp.getDpsInitOperand(0)->get());
+    auto newCopy = memref::CopyOp::create(rewriter, copyOp.getLoc(),
+                                          copyOp.getDpsInputOperand(0)->get(),
+                                          copyOp.getDpsInitOperand(0)->get());
+    newCopy->setAttrs(copyOp->getAttrs());
     rewriter.eraseOp(copyOp);
     return success();
   }
 };
 
+/// TODO(#22245): Enable vector masking for unaligned/dynamic copies to improve
+/// copy performance further.
 struct VectorizeMemrefCopyPass final
     : impl::VectorizeMemrefCopyPassBase<VectorizeMemrefCopyPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<arith::ArithDialect, vector::VectorDialect>();
+    registry.insert<arith::ArithDialect, linalg::LinalgDialect, scf::SCFDialect,
+                    vector::VectorDialect>();
   }
   void runOnOperation() override {
     MLIRContext *ctx = &getContext();
     auto funcOp = getOperation();
 
+    // First convert all `memref.copy` operations to `linalg.copy` so that they
+    // can be tiled. Tiling them avoids copies with dynamic dimensions if the
+    // dynamic dimension is not the innermost. Afterwards, tiled `linalg.copy`
+    // operations are converted back to `memref.copy` operations and vectorized.
     RewritePatternSet patterns(ctx);
+    patterns.add<TileLinalgCopy>(&getContext());
     patterns.add<linalg::CopyVectorizationPattern>(&getContext());
     patterns.add<ConvertLinalgCopyToMemrefCopy>(&getContext());
     (void)applyPatternsGreedily(funcOp, std::move(patterns));
+
+    // Clean up the temporary isTiled markers.
+    funcOp->walk([](memref::CopyOp copyOp) {
+      if (copyOp->hasAttr(kIsTiled)) {
+        copyOp->removeAttr(kIsTiled);
+      }
+    });
   }
 };
```

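The masking TODO above concerns copies whose innermost dimension is dynamic: tiling still leaves a dynamically shaped `memref.copy` inside the loop, which cannot be vectorized without masking. A rough sketch of that residual IR, based on the new `@memref_copy_dynamic_inner_dim` test below (subview types are illustrative):

```mlir
func.func @memref_copy_dynamic_inner_dim(%source: memref<4x?xf32>, %dest: memref<4x?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %dim = memref.dim %source, %c1 : memref<4x?xf32>
  scf.for %i = %c0 to %c4 step %c1 {
    scf.for %j = %c0 to %dim step %c4 {
      // The inner tile is clamped at the dynamic boundary, so its shape stays
      // dynamic and the copy below is left unvectorized.
      %min = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%j)[%dim]
      %src = memref.subview %source[%i, %j] [1, %min] [1, 1]
          : memref<4x?xf32> to memref<1x?xf32, strided<[?, 1], offset: ?>>
      %dst = memref.subview %dest[%i, %j] [1, %min] [1, 1]
          : memref<4x?xf32> to memref<1x?xf32, strided<[?, 1], offset: ?>>
      memref.copy %src, %dst
          : memref<1x?xf32, strided<[?, 1], offset: ?>> to memref<1x?xf32, strided<[?, 1], offset: ?>>
    }
  }
  return
}
```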
compiler/src/iree/compiler/Codegen/Common/test/vectorize_memref_copy.mlir

Lines changed: 100 additions & 4 deletions
```diff
@@ -8,8 +8,14 @@ func.func @memref_copy(%source: memref<2x2xf32>, %dest: memref<2x2xf32>) {
 // CHECK-LABEL: func.func @memref_copy
 // CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<2x2xf32>
 // CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<2x2xf32>
-// CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE]]
-// CHECK: vector.transfer_write %[[RD]], %[[DEST]]
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
+// CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
+// CHECK: vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
 
 // -----
 
@@ -21,5 +27,95 @@ func.func @linalg_copy(%source: memref<2x2xf32>, %dest: memref<2x2xf32>) {
 // CHECK-LABEL: func.func @linalg_copy
 // CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<2x2xf32>
 // CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<2x2xf32>
-// CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE]]
-// CHECK: vector.transfer_write %[[RD]], %[[DEST]]
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
+// CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
+// CHECK: vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
+
+// -----
+
+// Test with the last dimension larger than and not a multiple of the preferred number of copy elements.
+
+func.func @memref_copy_not_multiple_of_preferred(%source: memref<2x6xf32>, %dest: memref<2x6xf32>) {
+  memref.copy %source, %dest : memref<2x6xf32> to memref<2x6xf32>
+  return
+}
+// CHECK-LABEL: func.func @memref_copy_not_multiple_of_preferred
+// CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<2x6xf32>
+// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<2x6xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C1]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C6]] step %[[C4]]
+// CHECK: %[[MIN:.+]] = affine.min affine_map<(d0) -> (-d0 + 6, 4)>(%[[ARG3]])
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
+// CHECK: memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
+
+// -----
+
+// Test with the penultimate dimension larger than and not a multiple of the preferred number of copy elements on that dimension.
+
+func.func @memref_copy_not_multiple_on_penultimate_dim(%source: memref<3x2xf32>, %dest: memref<3x2xf32>) {
+  memref.copy %source, %dest : memref<3x2xf32> to memref<3x2xf32>
+  return
+}
+// CHECK-LABEL: func.func @memref_copy_not_multiple_on_penultimate_dim
+// CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<3x2xf32>
+// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<3x2xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C3]] step %[[C2]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
+// CHECK: %[[MIN:.+]] = affine.min affine_map<(d0) -> (-d0 + 3, 2)>(%[[ARG2]])
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [%[[MIN]], 2] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [%[[MIN]], 2] [1, 1]
+// CHECK: memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
+
+// -----
+
+func.func @memref_copy_dynamic(%source: memref<?x4xf32>, %dest: memref<?x4xf32>) {
+  memref.copy %source, %dest : memref<?x4xf32> to memref<?x4xf32>
+  return
+}
+// CHECK-LABEL: func.func @memref_copy_dynamic
+// CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<?x4xf32>
+// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<?x4xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[DIM:.+]] = memref.dim %[[SOURCE]], %[[C0]] : memref<?x4xf32>
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[DIM]] step %[[C1]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C4]] step %[[C4]]
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [1, 4] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [1, 4] [1, 1]
+// CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
+// CHECK: vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
+
+// -----
+
+func.func @memref_copy_dynamic_inner_dim(%source: memref<4x?xf32>, %dest: memref<4x?xf32>) {
+  memref.copy %source, %dest : memref<4x?xf32> to memref<4x?xf32>
+  return
+}
+// CHECK-LABEL: func.func @memref_copy_dynamic_inner_dim
+// CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<4x?xf32>
+// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<4x?xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[DIM:.+]] = memref.dim %[[SOURCE]], %[[C1]] : memref<4x?xf32>
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C4]] step %[[C1]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[DIM]] step %[[C4]]
+// CHECK: %[[MIN:.+]] = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%[[ARG3]])[%[[DIM]]]
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
+// CHECK: memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
```

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir

Lines changed: 1 addition & 2 deletions
```diff
@@ -231,8 +231,7 @@ hal.executable private @main {
 // CHECK: gpu.barrier
 // CHECK-DAG: %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<4xf16>
 // CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<4xf16>
-// CHECK-COUNT-1: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
-// CHECK-NOT: scf.for
+// CHECK-COUNT-1: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
 // CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
 
 // -----
```

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

Lines changed: 14 additions & 15 deletions
```diff
@@ -1281,22 +1281,21 @@ hal.executable public @main {
 }
 }
 
-// CHECK-LABEL: func @unaligned_to_intrinsic_batched_matmul_nocpromo
-// CHECK-NOT: memref.alloc() {{.*}}xf32
-// CHECK-DAG: memref.alloc() : memref<1x4x66xf32, #gpu.address_space<workgroup>>
-// CHECK-DAG: memref.alloc() : memref<1x16x6xf32, #gpu.address_space<workgroup>>
-// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
-// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
-// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
-// CHECK: scf.forall ({{.*}}) in (12, 37, 10) {
-// CHECK: scf.for %[[IV:.+]] = %c0 to %c144 step %c1 {{.*}} -> (vector<1x1x1x4x1xf32>)
-// CHECK: gpu.barrier
-// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
-// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
+// CHECK-LABEL: func @unaligned_to_intrinsic_batched_matmul_nocpromo
+// CHECK-NOT: memref.alloc() {{.*}}xf32
+// CHECK-DAG: memref.alloc() : memref<1x4x66xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG: memref.alloc() : memref<1x16x6xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
+// CHECK: scf.forall ({{.*}}) in (12, 37, 10) {
+// CHECK: scf.for %[[IV:.+]] = %c0 to %c144 step %c1 {{.*}} -> (vector<1x1x1x4x1xf32>)
+// CHECK: gpu.barrier
+// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
+// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
 // CHECK-COUNT-1: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 4 : i32, m = 16 : i32, n = 16 : i32
-// CHECK: scf.yield
-// CHECK-NOT: scf.for
-// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
+// CHECK: scf.yield
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
 
 // -----
```