Commit 8e374a6

[CPU][DT] Implement data layout propagation for CPU dispatches. (#21554)
The revision implements a specialized propagation pattern for the `tensor.collapse_shape->linalg.unpack` chain, which applies when the packed dimension and the corresponding inner dimension are collapsed together. The data layout propagation also populates patterns that sink `tensor.collapse_shape` ops down across `linalg.generic` ops, because matvec on CPU backends is materialized as a `linalg.mmt4d->tensor.collapse_shape` op chain. At the end, the pass folds the remaining reshapes into the bindings.

Fixes #21180

---------

Signed-off-by: hanhanW <[email protected]>
1 parent d6cdf25 commit 8e374a6
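
For context on the motivation: matvec on CPU backends with data tiling is materialized as a `linalg.mmt4d` whose result is collapsed and then unpacked. A minimal hand-written sketch of such a chain follows; the function name, shapes, and tile sizes are illustrative, not what the backend would actually choose:

func.func @matvec_like(%lhs: tensor<2x4x8x1xf32>, %rhs: tensor<1x4x1x1xf32>) -> tensor<13xf32> {
  %cst = arith.constant 0.0 : f32
  %init = tensor.empty() : tensor<2x1x8x1xf32>
  %acc = linalg.fill ins(%cst : f32) outs(%init : tensor<2x1x8x1xf32>) -> tensor<2x1x8x1xf32>
  // Matvec as mmt4d: the N dimensions are unit (N1 = N0 = 1).
  %mm = linalg.mmt4d ins(%lhs, %rhs : tensor<2x4x8x1xf32>, tensor<1x4x1x1xf32>)
                     outs(%acc : tensor<2x1x8x1xf32>) -> tensor<2x1x8x1xf32>
  // The unit dims are collapsed away before unpacking, yielding the
  // tensor.collapse_shape -> linalg.unpack chain this pass targets.
  %collapsed = tensor.collapse_shape %mm [[0, 1], [2, 3]] : tensor<2x1x8x1xf32> into tensor<2x8xf32>
  %empty = tensor.empty() : tensor<13xf32>
  %unpack = linalg.unpack %collapsed inner_dims_pos = [0] inner_tiles = [8] into %empty : tensor<2x8xf32> -> tensor<13xf32>
  return %unpack : tensor<13xf32>
}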

File tree

10 files changed: +345, -0 lines

compiler/src/iree/compiler/Codegen/Common/CPU/BUILD.bazel

Lines changed: 2 additions & 0 deletions
@@ -51,6 +51,7 @@ iree_compiler_cc_library(
     srcs = [
         "CPULowerToUKernels.cpp",
         "CPUPrepareUkernels.cpp",
+        "CPUPropagateDataLayout.cpp",
         "Passes.cpp",
     ],
     hdrs = [
@@ -78,6 +79,7 @@ iree_compiler_cc_library(
         "@llvm-project//mlir:BufferizationDialect",
         "@llvm-project//mlir:BufferizationInterfaces",
         "@llvm-project//mlir:DestinationStyleOpInterface",
+        "@llvm-project//mlir:DialectUtils",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",

compiler/src/iree/compiler/Codegen/Common/CPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ iree_cc_library(
   SRCS
     "CPULowerToUKernels.cpp"
     "CPUPrepareUkernels.cpp"
+    "CPUPropagateDataLayout.cpp"
     "Passes.cpp"
   DEPS
     ::PassHeaders
compiler/src/iree/compiler/Codegen/Common/CPU/CPUPropagateDataLayout.cpp

Lines changed: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Common/CPU/Passes.h"
+#include "iree/compiler/Codegen/Common/Transforms.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/LogicalResult.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_CPUPROPAGATEDATALAYOUTPASS
+#include "iree/compiler/Codegen/Common/CPU/Passes.h.inc"
+
+namespace {
+
+/// Sinks a tensor.collapse_shape down across a linalg.unpack op if the
+/// collapsed dims are two unit dims, one being an outer dimension and the
+/// other an inner dimension. In effect, the two ops are swapped by adjusting
+/// the packing metadata of the linalg.unpack op.
+/// Note that the pattern only supports the case where the destination tensor
+/// of the linalg.unpack op is a tensor.empty op. The constraint could be
+/// lifted by introducing a tensor.expand_shape op on the destination tensor;
+/// however, that is not common in practice, so it is not supported for now.
+struct SinkDownCollapsingUnitDimsAcrossUnpack final
+    : public OpRewritePattern<linalg::UnPackOp> {
+  using OpRewritePattern<linalg::UnPackOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(linalg::UnPackOp op,
+                                PatternRewriter &rewriter) const override {
+    if (!isIdentityPermutation(op.getOuterDimsPerm())) {
+      return rewriter.notifyMatchFailure(
+          op, "expected identity (or unset) outer permutation");
+    }
+    if (op.getSourceRank() != op.getDestRank() + 1) {
+      return rewriter.notifyMatchFailure(
+          op, "expected unpacking exactly one dimension");
+    }
+    auto emptyOp = op.getDest().getDefiningOp<tensor::EmptyOp>();
+    if (!emptyOp) {
+      return rewriter.notifyMatchFailure(
+          op, "expected destination to be a tensor.empty op");
+    }
+    auto collapseOp = op.getSource().getDefiningOp<tensor::CollapseShapeOp>();
+    if (!collapseOp) {
+      return rewriter.notifyMatchFailure(
+          op, "expected the source to be a tensor.collapse_shape op");
+    }
+
+    SmallVector<ReassociationIndices, 4> ri =
+        collapseOp.getReassociationIndices();
+    ReassociationIndices outerRi, innerRi;
+    for (ArrayRef<int64_t> indices : ri) {
+      if (indices.size() == 1) {
+        continue;
+      }
+      if (indices.size() > 2) {
+        return rewriter.notifyMatchFailure(
+            op, "expected re-association map to have two dimensions");
+      }
+      if (outerRi.empty()) {
+        outerRi.assign(indices.begin(), indices.end());
+        continue;
+      }
+      if (innerRi.empty()) {
+        innerRi.assign(indices.begin(), indices.end());
+        continue;
+      }
+      return rewriter.notifyMatchFailure(
+          op, "expected only two re-association maps to have two dimensions");
+    }
+    if (outerRi.empty() || innerRi.empty()) {
+      return rewriter.notifyMatchFailure(
+          op, "expected only two re-association maps to have two dimensions");
+    }
+
+    RankedTensorType srcType = collapseOp.getSrcType();
+    if (innerRi.back() != srcType.getRank() - 1) {
+      return rewriter.notifyMatchFailure(
+          op, "expected that the two innermost dimensions are collapsed");
+    }
+    SmallVector<int64_t> innerDimPos(op.getInnerDimsPos());
+    if (!llvm::is_contained(outerRi, innerDimPos[0])) {
+      return rewriter.notifyMatchFailure(
+          op, "expected the packed dimension to be collapsed");
+    }
+
+    bool missLeadingUnitDim = srcType.getDimSize(outerRi[0]) == 1 &&
+                              srcType.getDimSize(innerRi[0]) == 1;
+    bool missTrailingUnitDim = srcType.getDimSize(outerRi[1]) == 1 &&
+                               srcType.getDimSize(innerRi[1]) == 1;
+    if (!missLeadingUnitDim && !missTrailingUnitDim) {
+      return rewriter.notifyMatchFailure(op,
+                                         "expected collapsing either leading "
+                                         "unit dims or trailing unit dims");
+    }
+
+    // We add unit dims either right before or right after the packed
+    // dimensions. E.g., AxBxNxCxDxn becomes AxBx1xNxCxDx1xn if
+    // `missLeadingUnitDim` is true, and AxBxNx1xCxDxnx1 if
+    // `missTrailingUnitDim` is true. If both are true, the former is
+    // prioritized because the choice does not matter in practice.
+    SmallVector<OpFoldResult> innerTiles(op.getMixedTiles());
+    SmallVector<OpFoldResult> destShape = emptyOp.getMixedSizes();
+    if (missLeadingUnitDim) {
+      // The unit dim is inserted right before the packed dimension, so advance
+      // innerDimPos[0] by one.
+      innerDimPos[0]++;
+      innerDimPos.insert(innerDimPos.begin(), outerRi[0]);
+      innerTiles.insert(innerTiles.begin(), rewriter.getIndexAttr(1));
+      destShape.insert(destShape.begin() + outerRi[0],
+                       rewriter.getIndexAttr(1));
+    } else {
+      innerDimPos.insert(innerDimPos.end(), outerRi[1]);
+      innerTiles.insert(innerTiles.end(), rewriter.getIndexAttr(1));
+      destShape.insert(destShape.end(), rewriter.getIndexAttr(1));
+    }
+
+    Location loc = op.getLoc();
+    auto newDestOp = rewriter.create<tensor::EmptyOp>(
+        loc, destShape, emptyOp.getType().getElementType());
+    auto newUnpackOp = rewriter.create<linalg::UnPackOp>(
+        loc, collapseOp.getSrc(), newDestOp, innerDimPos, innerTiles);
+    SmallVector<ReassociationIndices> newRi;
+    for (int64_t i = 0, e = op.getDestRank(); i < e; ++i) {
+      if (i == outerRi[0]) {
+        newRi.push_back(outerRi);
+        ++i;
+      } else {
+        newRi.push_back({i});
+      }
+    }
+    rewriter.replaceOpWithNewOp<tensor::CollapseShapeOp>(
+        op, newUnpackOp.getResult(), newRi);
+
+    return success();
+  }
+};
+
+struct CPUPropagateDataLayoutPass final
+    : public impl::CPUPropagateDataLayoutPassBase<CPUPropagateDataLayoutPass> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<linalg::LinalgDialect, tensor::TensorDialect>();
+  }
+
+  void runOnOperation() override;
+};
+
+} // namespace
+
+void CPUPropagateDataLayoutPass::runOnOperation() {
+  MLIRContext *ctx = &getContext();
+  FunctionOpInterface funcOp = getOperation();
+  RewritePatternSet patterns(ctx);
+  patterns.insert<SinkDownCollapsingUnitDimsAcrossUnpack>(ctx);
+  populateReshapeToInterfaceTensorPatterns(patterns);
+  tensor::populateFoldTensorEmptyPatterns(patterns, /*foldSingleUseOnly=*/1);
+  linalg::populateFoldReshapeOpsByExpansionPatterns(
+      patterns, [](OpOperand *fusedOperand) -> bool {
+        Operation *producer = fusedOperand->get().getDefiningOp();
+        auto consumerGenericOp =
+            dyn_cast_if_present<linalg::GenericOp>(fusedOperand->getOwner());
+        if (!isa_and_present<tensor::CollapseShapeOp>(producer) ||
+            !consumerGenericOp) {
+          return false;
+        }
+        return true;
+      });
+  if (failed(applyPatternsGreedily(funcOp, std::move(patterns)))) {
+    return signalPassFailure();
+  }
+}
+
+} // namespace mlir::iree_compiler
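
For intuition, the core rewrite swaps the reshape and the unpack by folding the collapsed unit dims into the packing metadata. Taking the first lit test below as the example, the pattern turns

  %collapsed = tensor.collapse_shape %src [[0, 1], [2, 3]] : tensor<1x2x1x16xi32> into tensor<2x16xi32>
  %empty = tensor.empty() : tensor<20xi32>
  %unpack = linalg.unpack %collapsed inner_dims_pos = [0] inner_tiles = [16] into %empty : tensor<2x16xi32> -> tensor<20xi32>

into

  %empty = tensor.empty() : tensor<1x20xi32>
  %unpack = linalg.unpack %src inner_dims_pos = [0, 1] inner_tiles = [1, 16] into %empty : tensor<1x2x1x16xi32> -> tensor<1x20xi32>
  %collapsed = tensor.collapse_shape %unpack [[0, 1]] : tensor<1x20xi32> into tensor<20xi32>

so the unpack consumes the packed layout directly, and the remaining unit-dim collapse can be folded away downstream (e.g. into the bindings).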

compiler/src/iree/compiler/Codegen/Common/CPU/Passes.td

Lines changed: 6 additions & 0 deletions
@@ -30,4 +30,10 @@ def CPUPrepareUkernelsPass :
         "For example, batch_mmt4d ops are decomposed to mmt4d ops";
 }
 
+def CPUPropagateDataLayoutPass :
+    InterfacePass<"iree-codegen-cpu-propagate-data-layout", "mlir::FunctionOpInterface"> {
+  let summary = "Propagates pack/unpack/reshape ops to make the whole dispatch "
+                "use the same layout.";
+}
+
 #endif // IREE_CODEGEN_COMMON_CPU_PASSES
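
The pass can be exercised in isolation with iree-opt, as the lit tests below do (`input.mlir` here is a placeholder for any file to process):

iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-cpu-propagate-data-layout))" input.mlir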

compiler/src/iree/compiler/Codegen/Common/CPU/test/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ iree_lit_test_suite(
         [
             "lower_to_ukernel_ops.mlir",
             "prepare_ukernels.mlir",
+            "propagate_data_layout.mlir",
         ],
         include = ["*.mlir"],
     ),

compiler/src/iree/compiler/Codegen/Common/CPU/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ iree_lit_test_suite(
   SRCS
     "lower_to_ukernel_ops.mlir"
    "prepare_ukernels.mlir"
+    "propagate_data_layout.mlir"
   TOOLS
     FileCheck
     iree-opt
compiler/src/iree/compiler/Codegen/Common/CPU/test/propagate_data_layout.mlir

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-cpu-propagate-data-layout))" --split-input-file %s | FileCheck %s
+
+func.func @collapsing_unit_dim_0(%src: tensor<1x2x1x16xi32>) -> tensor<20xi32> {
+  %collapsed = tensor.collapse_shape %src [[0, 1], [2, 3]] : tensor<1x2x1x16xi32> into tensor<2x16xi32>
+  %1 = tensor.empty() : tensor<20xi32>
+  %unpack = linalg.unpack %collapsed inner_dims_pos = [0] inner_tiles = [16] into %1 : tensor<2x16xi32> -> tensor<20xi32>
+  return %unpack : tensor<20xi32>
+}
+// CHECK-LABEL: func.func @collapsing_unit_dim_0(
+// CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]
+// CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[SRC]]
+// CHECK-SAME:      inner_dims_pos = [0, 1] inner_tiles = [1, 16]
+// CHECK-SAME:      : tensor<1x2x1x16xi32> -> tensor<1x20xi32>
+// CHECK-NEXT:    %[[COLLAPSED:.+]] = tensor.collapse_shape %[[UNPACK]]
+// CHECK-SAME:      : tensor<1x20xi32> into tensor<20xi32>
+// CHECK:         return %[[COLLAPSED]]
+
+// -----
+
+func.func @collapsing_unit_dim_1(%src: tensor<?x1x1x1x16xi32>, %batch_size: index) -> tensor<?x3xi32> {
+  %collapsed = tensor.collapse_shape %src [[0], [1, 2], [3, 4]] : tensor<?x1x1x1x16xi32> into tensor<?x1x16xi32>
+  %0 = tensor.empty(%batch_size) : tensor<?x3xi32>
+  %unpack = linalg.unpack %collapsed inner_dims_pos = [1] inner_tiles = [16] into %0 : tensor<?x1x16xi32> -> tensor<?x3xi32>
+  return %unpack : tensor<?x3xi32>
+}
+// CHECK-LABEL: func.func @collapsing_unit_dim_1(
+// CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]
+// CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[SRC]]
+// CHECK-SAME:      inner_dims_pos = [1, 2] inner_tiles = [1, 16]
+// CHECK-SAME:      : tensor<?x1x1x1x16xi32> -> tensor<?x1x3xi32>
+// CHECK-NEXT:    %[[COLLAPSED:.+]] = tensor.collapse_shape %[[UNPACK]]
+// CHECK-SAME:      : tensor<?x1x3xi32> into tensor<?x3xi32>
+// CHECK:         return %[[COLLAPSED]]
+
+// -----
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+func.func @collapsing_unit_dim_0_elem_unpack(%src: tensor<1x1x1x16xi32>) -> tensor<3xi32> {
+  %0 = tensor.empty() : tensor<1x16xi32>
+  %collapsed = tensor.collapse_shape %src [[0, 1], [2, 3]] : tensor<1x1x1x16xi32> into tensor<1x16xi32>
+  %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%collapsed : tensor<1x16xi32>) outs(%0 : tensor<1x16xi32>) {
+  ^bb0(%in: i32, %out: i32):
+    %3 = arith.addi %in, %in : i32
+    linalg.yield %3 : i32
+  } -> tensor<1x16xi32>
+  %2 = tensor.empty() : tensor<3xi32>
+  %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [16] into %2 : tensor<1x16xi32> -> tensor<3xi32>
+  return %unpack : tensor<3xi32>
+}
+// CHECK-LABEL: func.func @collapsing_unit_dim_0_elem_unpack(
+// CHECK:         %[[ELEM:.+]] = linalg.generic
+// CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[ELEM]]
+// CHECK-SAME:      inner_dims_pos = [0, 1] inner_tiles = [1, 16]
+// CHECK-SAME:      : tensor<1x1x1x16xi32> -> tensor<1x3xi32>
+// CHECK-NEXT:    %[[COLLAPSED:.+]] = tensor.collapse_shape %[[UNPACK]]
+// CHECK-SAME:      : tensor<1x3xi32> into tensor<3xi32>
+// CHECK:         return %[[COLLAPSED]]
+
+// -----
+
+func.func @negative_unpack_with_outer_dims_perm(%src: tensor<1x1x?x1x16xi32>, %batch_size: index) -> tensor<?x3xi32> {
+  %collapsed = tensor.collapse_shape %src [[0], [1, 2], [3, 4]] : tensor<1x1x?x1x16xi32> into tensor<1x?x16xi32>
+  %0 = tensor.empty(%batch_size) : tensor<?x3xi32>
+  %unpack = linalg.unpack %collapsed outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [16] into %0 : tensor<1x?x16xi32> -> tensor<?x3xi32>
+  return %unpack : tensor<?x3xi32>
+}
+// CHECK-LABEL: func.func @negative_unpack_with_outer_dims_perm(
+// CHECK:         tensor.collapse_shape
+// CHECK:         linalg.unpack
+
+// -----
+
+func.func @negative_unpack_multiple_dims(%src: tensor<?x1x1x1x16x8xi32>, %d0: index, %d1: index) -> tensor<?x?xi32> {
+  %collapsed = tensor.collapse_shape %src [[0], [1, 2], [3, 4], [5]] : tensor<?x1x1x1x16x8xi32> into tensor<?x1x16x8xi32>
+  %0 = tensor.empty(%d0, %d1) : tensor<?x?xi32>
+  %unpack = linalg.unpack %collapsed inner_dims_pos = [0, 1] inner_tiles = [16, 8] into %0 : tensor<?x1x16x8xi32> -> tensor<?x?xi32>
+  return %unpack : tensor<?x?xi32>
+}
+// CHECK-LABEL: func.func @negative_unpack_multiple_dims(
+// CHECK:         tensor.collapse_shape
+// CHECK:         linalg.unpack
+
+// -----
+
+func.func @negative_unpack_non_collapsed_dim(%src: tensor<?x1x1x1x16xi32>, %d0: index) -> tensor<?x1xi32> {
+  %collapsed = tensor.collapse_shape %src [[0], [1, 2], [3, 4]] : tensor<?x1x1x1x16xi32> into tensor<?x1x16xi32>
+  %0 = tensor.empty(%d0) : tensor<?x1xi32>
+  %unpack = linalg.unpack %collapsed inner_dims_pos = [0] inner_tiles = [16] into %0 : tensor<?x1x16xi32> -> tensor<?x1xi32>
+  return %unpack : tensor<?x1xi32>
+}
+// CHECK-LABEL: func.func @negative_unpack_non_collapsed_dim(
+// CHECK:         tensor.collapse_shape
+// CHECK:         linalg.unpack
+
+// -----
+
+func.func @negative_both_m_n_non_unit_dim(%src: tensor<3x4x2x8xi32>) -> tensor<180xi32> {
+  %collapsed = tensor.collapse_shape %src [[0, 1], [2, 3]] : tensor<3x4x2x8xi32> into tensor<12x16xi32>
+  %1 = tensor.empty() : tensor<180xi32>
+  %unpack = linalg.unpack %collapsed inner_dims_pos = [0] inner_tiles = [16] into %1 : tensor<12x16xi32> -> tensor<180xi32>
+  return %unpack : tensor<180xi32>
+}
+// CHECK-LABEL: func.func @negative_both_m_n_non_unit_dim(
+// CHECK:         tensor.collapse_shape
+// CHECK:         linalg.unpack
+
+// -----
+
+func.func @negative_innermost_dim_is_not_collapsed(%src: tensor<1x3x1x8x16xi32>) -> tensor<48x8xi32> {
+  %collapsed = tensor.collapse_shape %src [[0, 1], [2, 3], [4]] : tensor<1x3x1x8x16xi32> into tensor<3x8x16xi32>
+  %1 = tensor.empty() : tensor<48x8xi32>
+  %unpack = linalg.unpack %collapsed inner_dims_pos = [0] inner_tiles = [16] into %1 : tensor<3x8x16xi32> -> tensor<48x8xi32>
+  return %unpack : tensor<48x8xi32>
+}
+// CHECK-LABEL: func.func @negative_innermost_dim_is_not_collapsed(
+// CHECK:         tensor.collapse_shape
+// CHECK:         linalg.unpack

compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp

Lines changed: 1 addition & 0 deletions
@@ -843,6 +843,7 @@ void buildLLVMCPUCodegenConfigurationPassPipelineImpl(
       // way too late and should instead be done during lowering to LLVM.
       .addPass(createExpandF16OpToF32Pass)
       .addPass(createMaterializeDeviceEncodingPass)
+      .addPass(createCPUPropagateDataLayoutPass)
       .addPass(createConvertAccGEMMToGEMMPass)
       // TODO: Remove the following pass and plumb support for
       // #hal.descriptor_type memory space through the stack.

tests/e2e/linalg/BUILD.bazel

Lines changed: 17 additions & 0 deletions
@@ -51,6 +51,23 @@ iree_check_single_backend_test_suite(
     target_backend = "llvm-cpu",
 )
 
+# TODO(#19378): Delete the test suite once data-tiling fusion is default on.
+iree_check_single_backend_test_suite(
+    name = "check_llvm-cpu_dt_fusion_local-task",
+    srcs = ["narrow_n_matmuls.mlir"],
+    compiler_flags = [
+        "--iree-dispatch-creation-experimental-data-tiling",
+        "--iree-llvmcpu-target-cpu=generic",
+        "--iree-opt-data-tiling=false",
+    ],
+    driver = "local-task",
+    tags = [
+        # Subbyte support for wasm is not a priority.
+        "nowasm",
+    ],
+    target_backend = "llvm-cpu",
+)
+
 VMVX_SRCS = enforce_glob(
     # keep sorted
     [
