Commit 1a8d229

[GlobalOptimizations] Add a pass to simplify strided contraction-like ops (#20607)
This PR adds a pattern to manipulate generic ops which satisfy all "contraction" conditions except for the indexing maps being projected permutations. Namely, if the input indexing map has results of the form `dim * cst`, this pattern factors the original generic op into a `tensor.extract_slice` followed by a contraction `linalg.generic`. This addresses #20600. None of the included lit test examples compiled to mfma instructions before this patch.

---------

Signed-off-by: zjgarvey <[email protected]>
1 parent b4e3694 commit 1a8d229
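As a compact before/after sketch of the rewrite (condensed from the first lit test added in this patch, @strided_from_output_static, with unrelated detail elided as `...`):

// Before: the input indexing map strides d1 and d2 by 2.
%2 = linalg.generic {indexing_maps = [
       affine_map<(d0, d1, d2, d3, d4) -> (d0, 2 * d1, d2 * 2, d4)>,
       affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>,
       affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], ...}
       ins(%input, %filter : tensor<2x118x182x448xbf16>, tensor<896x448xbf16>) ...

// After: the strides move into a tensor.extract_slice and the generic
// becomes a recognizable contraction.
%slice = tensor.extract_slice %input[0, 0, 0, 0] [2, 59, 91, 448] [1, 2, 2, 1]
           : tensor<2x118x182x448xbf16> to tensor<2x59x91x448xbf16>
%2 = linalg.generic {indexing_maps = [
       affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, ...], ...}
       ins(%slice, %filter : tensor<2x59x91x448xbf16>, tensor<896x448xbf16>) ...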

File tree

9 files changed: +275 -0 lines changed


compiler/src/iree/compiler/GlobalOptimization/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ iree_compiler_cc_library(
     srcs = [
         "CleanupNumericNarrowing.cpp",
         "Convert1X1FilterConv2DToMatmul.cpp",
+        "ConvertStridedContractionToContraction.cpp",
         "DataLayoutPropagation.cpp",
         "DecomposeConcat.cpp",
         "DemoteContractionInputsToBF16.cpp",

compiler/src/iree/compiler/GlobalOptimization/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ iree_cc_library(
   SRCS
     "CleanupNumericNarrowing.cpp"
     "Convert1X1FilterConv2DToMatmul.cpp"
+    "ConvertStridedContractionToContraction.cpp"
     "DataLayoutPropagation.cpp"
     "DecomposeConcat.cpp"
     "DemoteContractionInputsToBF16.cpp"

compiler/src/iree/compiler/GlobalOptimization/ConvertStridedContractionToContraction.cpp

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/GlobalOptimization/Passes.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/WalkPatternRewriteDriver.h"

namespace mlir::iree_compiler::GlobalOptimization {

#define GEN_PASS_DEF_CONVERTSTRIDEDCONTRACTIONTOCONTRACTIONPASS
#include "iree/compiler/GlobalOptimization/Passes.h.inc"

namespace {

class ConvertStridedContractionToContraction
    : public OpRewritePattern<linalg::GenericOp> {
public:
  using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;
  LogicalResult matchAndRewrite(linalg::GenericOp op,
                                PatternRewriter &rewriter) const override {
    // Check if the generic op satisfies all other conditions for being a
    // contraction.
    if (op.getNumDpsInputs() != 2 || op.getNumDpsInits() != 1)
      return failure();
    if (op.getNumReductionLoops() == 0)
      return failure();
    if (!mlir::linalg::detail::isContractionBody(
            *op.getBlock(), [](Operation *first, Operation *second) {
              if ((isa<arith::MulFOp>(first) && isa<arith::AddFOp>(second)) ||
                  (isa<arith::MulIOp>(first) && isa<arith::AddIOp>(second)))
                return true;
              return false;
            })) {
      return failure();
    }

    SmallVector<AffineMap> mapRange = op.getIndexingMapsArray();
    unsigned inputPos = op.getDpsInputOperand(0)->getOperandNumber();
    unsigned filterPos = op.getDpsInputOperand(1)->getOperandNumber();
    unsigned resInitPos = op.getDpsInitOperand(0)->getOperandNumber();
    AffineMap inputMap = mapRange[inputPos];
    AffineMap filterMap = mapRange[filterPos];
    AffineMap resultMap = mapRange[resInitPos];
    // For now, we are only handling the case where the first input is the
    // only non-projected permutation.
    if (!filterMap.isProjectedPermutation() ||
        !resultMap.isProjectedPermutation()) {
      return failure();
    }
    if (inputMap.isProjectedPermutation())
      return failure();
    SmallVector<int64_t, 4> staticShape = op.getStaticLoopRanges();

    llvm::SmallDenseMap<unsigned, int64_t> strides;
    SmallVector<AffineExpr> replacementExprs;
    Value input = op.getDpsInputs()[0];
    auto inputTy = dyn_cast<RankedTensorType>(input.getType());
    if (!inputTy)
      return failure();
    SmallVector<int64_t> inputShape(inputTy.getShape());
    replacementExprs.reserve(inputMap.getNumResults());
    // Walk through input map and look for expressions of the form `dim * cst`.
    for (auto [pos, expr] : llvm::enumerate(inputMap.getResults())) {
      // Skip dim exprs and constant exprs.
      if (isa<AffineDimExpr>(expr) || isa<AffineConstantExpr>(expr)) {
        replacementExprs.push_back(expr);
        continue;
      }
      // Look at binary op expressions.
      auto binexpr = dyn_cast<AffineBinaryOpExpr>(expr);
      // Fail if we see some unexpected kind of expression.
      if (!binexpr)
        return failure();
      auto rhs = dyn_cast<AffineConstantExpr>(binexpr.getRHS());
      auto lhs = dyn_cast<AffineDimExpr>(binexpr.getLHS());
      // Binary expressions must be of the form `dim * cst`.
      if (!rhs || !lhs || binexpr.getKind() != AffineExprKind::Mul) {
        replacementExprs.push_back(expr);
        continue;
      }
      strides.insert(std::pair<unsigned, int64_t>(pos, rhs.getValue()));
      int64_t newSize = staticShape[lhs.getPosition()];
      if (newSize == ShapedType::kDynamic || newSize == 0)
        return failure();
      inputShape[pos] = newSize;
      replacementExprs.push_back(lhs);
    }

    // Fail if we don't have any work to do.
    if (strides.empty())
      return failure();

    mapRange[inputPos] =
        AffineMap::get(inputMap.getNumDims(), inputMap.getNumSymbols(),
                       replacementExprs, op.getContext());
    auto sliceTy = RankedTensorType::get(inputShape, inputTy.getElementType());

    unsigned rank = inputTy.getRank();
    SmallVector<OpFoldResult> vOffset(rank, rewriter.getIndexAttr(0));
    SmallVector<OpFoldResult> vSizes;
    SmallVector<OpFoldResult> vStride(rank, rewriter.getIndexAttr(1));
    Location loc = op.getLoc();
    for (unsigned i = 0; i < inputTy.getRank(); i++) {
      if (strides.contains(i)) {
        vStride[i] = rewriter.getIndexAttr(strides.at(i));
      }
      if (inputShape[i] != ShapedType::kDynamic) {
        vSizes.push_back(rewriter.getIndexAttr(inputShape[i]));
        continue;
      }
      vSizes.push_back(rewriter.createOrFold<tensor::DimOp>(loc, input, i));
    }
    Value extractedSlice = rewriter.create<tensor::ExtractSliceOp>(
        loc, sliceTy, input, vOffset, vSizes, vStride);
    rewriter.startOpModification(op);
    op.setIndexingMapsAttr(rewriter.getAffineMapArrayAttr(mapRange));
    op.setOperand(0, extractedSlice);
    rewriter.finalizeOpModification(op);
    return success();
  }
};

struct ConvertStridedContractionToContractionPass
    : public impl::ConvertStridedContractionToContractionPassBase<
          ConvertStridedContractionToContractionPass> {
  void getDependentDialects(DialectRegistry &registry) const override {
    registry.insert<arith::ArithDialect, tensor::TensorDialect>();
  }

  void runOnOperation() override {
    MLIRContext *context = &getContext();
    RewritePatternSet patterns(&getContext());
    patterns.insert<ConvertStridedContractionToContraction>(context);
    walkAndApplyPatterns(getOperation(), std::move(patterns));
  }
};
} // namespace
} // namespace mlir::iree_compiler::GlobalOptimization
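One detail worth highlighting from the loop above, illustrated with the indexing maps of the @strided_from_output_partial_conv lit test added in this patch: results that are not a plain `dim * cst` product (for example the convolution-style `d2 * 2 + d4`) are passed through unchanged, so only the purely strided dimensions are folded into the slice.

// Input map before the rewrite (d1 and d2 both look strided):
affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1 * 2, d2 * 2 + d4, d5)>
// Input map after the rewrite: d1 * 2 becomes d1 and its stride of 2 moves
// into the tensor.extract_slice (strides [1, 2, 1, 1], sizes [2, 59, 182, 448]),
// while d2 * 2 + d4 is left in place.
affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2 * 2 + d4, d5)>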

compiler/src/iree/compiler/GlobalOptimization/Passes.cpp

Lines changed: 2 additions & 0 deletions
@@ -155,6 +155,8 @@ void buildGlobalOptimizationPassPipeline(
   });

   mainPassManager.addPass(DispatchCreation::createFoldUnitExtentDimsPass());
+  mainPassManager.addPass(
+      GlobalOptimization::createConvertStridedContractionToContractionPass());
   FunctionLikeNest(mainPassManager)
       .addPredicatedPass(clEnableFuseSiluHorizontalMatmul,
                          createFuseSiluHorizontalMatmulPass)

compiler/src/iree/compiler/GlobalOptimization/Passes.td

Lines changed: 5 additions & 0 deletions
@@ -19,6 +19,11 @@ def Convert1X1FilterConv2DToMatmulPass:
   let summary = "Convert linalg convolution ops with 1x1 kernels into linalg matrix multiplication ops.";
 }

+def ConvertStridedContractionToContractionPass:
+    Pass<"iree-global-opt-convert-strided-contraction-to-contraction", ""> {
+  let summary = "Factors out an extract_slice from contraction-like ops with strided inputs.";
+}
+
 def DecomposeConcatPass :
     Pass<"iree-global-opt-decompose-concat", ""> {
   let summary = "Decomposes concatenations into a destination and a sequence of slice inserts.";

compiler/src/iree/compiler/GlobalOptimization/test/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ iree_lit_test_suite(
             "propagate_linalg_transpose.mlir",
             "raise_special_ops.mlir",
             "remove_zero_extent_tensors.mlir",
+            "strided_contraction_to_contraction.mlir",
             "transformation_pipeline.mlir",
             "transpose_and_decompose_concat.mlir",
             "warn_on_uninitialized_values.mlir",

compiler/src/iree/compiler/GlobalOptimization/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ iree_lit_test_suite(
     "propagate_linalg_transpose.mlir"
     "raise_special_ops.mlir"
     "remove_zero_extent_tensors.mlir"
+    "strided_contraction_to_contraction.mlir"
     "transformation_pipeline.mlir"
     "transpose_and_decompose_concat.mlir"
     "warn_on_uninitialized_values.mlir"

compiler/src/iree/compiler/GlobalOptimization/test/strided_contraction_to_contraction.mlir

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
// RUN: iree-opt --split-input-file --mlir-print-local-scope -iree-global-opt-convert-strided-contraction-to-contraction %s | FileCheck %s

util.func public @strided_from_output_static(%input: tensor<2x118x182x448xbf16>, %filter: tensor<896x448xbf16>) -> tensor<2x59x91x896xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = tensor.empty() : tensor<2x59x91x896xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x59x91x896xf32>) -> tensor<2x59x91x896xf32>
  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, 2 * d1, d2 * 2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%input, %filter : tensor<2x118x182x448xbf16>, tensor<896x448xbf16>) outs(%1 : tensor<2x59x91x896xf32>) {
  ^bb0(%in: bf16, %in_0: bf16, %out: f32):
    %3 = arith.extf %in : bf16 to f32
    %4 = arith.extf %in_0 : bf16 to f32
    %5 = arith.mulf %3, %4 : f32
    %6 = arith.addf %out, %5 : f32
    linalg.yield %6 : f32
  } -> tensor<2x59x91x896xf32>
  util.return %2 : tensor<2x59x91x896xf32>
}

// CHECK-LABEL: @strided_from_output_static(
// CHECK-SAME:    %[[INPUT:.*]]: tensor<2x118x182x448xbf16>
// CHECK-SAME:    %[[FILTER:.*]]: tensor<896x448xbf16>
// CHECK:         %[[SLICE:.*]] = tensor.extract_slice %[[INPUT]][0, 0, 0, 0] [2, 59, 91, 448] [1, 2, 2, 1]
// CHECK-SAME:      tensor<2x118x182x448xbf16> to tensor<2x59x91x448xbf16>
// CHECK:         %[[GEN:.*]] = linalg.generic
// CHECK-SAME:      affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>
// CHECK-SAME:      affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>
// CHECK-SAME:      affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>
// CHECK-SAME:      ins(%[[SLICE]], %[[FILTER]]
// CHECK:         util.return %[[GEN]]

// -----

util.func public @strided_from_output_dynamic_batch(%input: tensor<?x118x182x448xbf16>, %filter: tensor<896x448xbf16>) -> tensor<?x59x91x896xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %dim = tensor.dim %input, %c0 : tensor<?x118x182x448xbf16>
  %0 = tensor.empty(%dim) : tensor<?x59x91x896xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x59x91x896xf32>) -> tensor<?x59x91x896xf32>
  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1 * 2, d2 * 2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%input, %filter : tensor<?x118x182x448xbf16>, tensor<896x448xbf16>) outs(%1 : tensor<?x59x91x896xf32>) {
  ^bb0(%in: bf16, %in_0: bf16, %out: f32):
    %3 = arith.extf %in : bf16 to f32
    %4 = arith.extf %in_0 : bf16 to f32
    %5 = arith.mulf %3, %4 : f32
    %6 = arith.addf %out, %5 : f32
    linalg.yield %6 : f32
  } -> tensor<?x59x91x896xf32>
  util.return %2 : tensor<?x59x91x896xf32>
}

// CHECK-LABEL: @strided_from_output_dynamic_batch(
// CHECK-SAME:    %[[INPUT:.*]]: tensor<?x118x182x448xbf16>
// CHECK-SAME:    %[[FILTER:.*]]: tensor<896x448xbf16>
// CHECK:         %[[SLICE:.*]] = tensor.extract_slice %[[INPUT]][0, 0, 0, 0] [%[[DIM:.*]], 59, 91, 448] [1, 2, 2, 1]
// CHECK-SAME:      tensor<?x118x182x448xbf16> to tensor<?x59x91x448xbf16>
// CHECK:         %[[GEN:.*]] = linalg.generic
// CHECK-SAME:      affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>
// CHECK-SAME:      affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>
// CHECK-SAME:      affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>
// CHECK-SAME:      ins(%[[SLICE]], %[[FILTER]]
// CHECK:         util.return %[[GEN]]

// -----

util.func public @strided_from_output_partial_conv(%input: tensor<2x118x182x448xbf16>, %filter: tensor<896x2x448xbf16>) -> tensor<2x59x91x896xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = tensor.empty() : tensor<2x59x91x896xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x59x91x896xf32>) -> tensor<2x59x91x896xf32>
  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1 * 2, d2 * 2 + d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%input, %filter : tensor<2x118x182x448xbf16>, tensor<896x2x448xbf16>) outs(%1 : tensor<2x59x91x896xf32>) {
  ^bb0(%in: bf16, %in_0: bf16, %out: f32):
    %3 = arith.extf %in : bf16 to f32
    %4 = arith.extf %in_0 : bf16 to f32
    %5 = arith.mulf %3, %4 : f32
    %6 = arith.addf %out, %5 : f32
    linalg.yield %6 : f32
  } -> tensor<2x59x91x896xf32>
  util.return %2 : tensor<2x59x91x896xf32>
}

// CHECK-LABEL: @strided_from_output_partial_conv
// CHECK-SAME:    %[[INPUT:.*]]: tensor<2x118x182x448xbf16>
// CHECK-SAME:    %[[FILTER:.*]]: tensor<896x2x448xbf16>
// CHECK:         %[[SLICE:.*]] = tensor.extract_slice %[[INPUT]][0, 0, 0, 0] [2, 59, 182, 448] [1, 2, 1, 1]
// CHECK-SAME:      tensor<2x118x182x448xbf16> to tensor<2x59x182x448xbf16>
// CHECK:         linalg.generic
// CHECK-SAME:      affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2 * 2 + d4, d5)>
// CHECK-SAME:      affine_map<(d0, d1, d2, d3, d4, d5) -> (d3, d4, d5)>
// CHECK-SAME:      affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>
// CHECK-SAME:      ins(%[[SLICE]], %[[FILTER]]

// -----

util.func public @strided_from_filter_static(%input: tensor<896x118x16xbf16>, %filter: tensor<448x59x16xbf16>) -> tensor<896x448xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = tensor.empty() : tensor<896x448xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<896x448xf32>) -> tensor<896x448xf32>
  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2 * 2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%input, %filter : tensor<896x118x16xbf16>, tensor<448x59x16xbf16>) outs(%1 : tensor<896x448xf32>) {
  ^bb0(%in: bf16, %in_0: bf16, %out: f32):
    %3 = arith.extf %in : bf16 to f32
    %4 = arith.extf %in_0 : bf16 to f32
    %5 = arith.mulf %3, %4 : f32
    %6 = arith.addf %out, %5 : f32
    linalg.yield %6 : f32
  } -> tensor<896x448xf32>
  util.return %2 : tensor<896x448xf32>
}

// CHECK-LABEL: @strided_from_filter_static(
// CHECK-SAME:    %[[INPUT:.*]]: tensor<896x118x16xbf16>
// CHECK-SAME:    %[[FILTER:.*]]: tensor<448x59x16xbf16>
// CHECK:         %[[SLICE:.*]] = tensor.extract_slice %[[INPUT]][0, 0, 0] [896, 59, 16] [1, 2, 1]
// CHECK-SAME:      tensor<896x118x16xbf16> to tensor<896x59x16xbf16>
// CHECK:         %[[GEN:.*]] = linalg.generic
// CHECK-SAME:      affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
// CHECK-SAME:      affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>
// CHECK-SAME:      affine_map<(d0, d1, d2, d3) -> (d0, d1)>
// CHECK-SAME:      ins(%[[SLICE]], %[[FILTER]]
// CHECK:         util.return %[[GEN]]

compiler/src/iree/compiler/Preprocessing/Passes.cpp

Lines changed: 2 additions & 0 deletions
@@ -147,6 +147,8 @@ buildMakeSingleDispatchPassPipeline(OpPassManager &passManager,
   // Generalize transposes and any other remaining named linalg ops that can
   // now be represented as generics.
   passManager.addPass(GlobalOptimization::createGeneralizeLinalgNamedOpsPass());
+  passManager.addPass(
+      GlobalOptimization::createConvertStridedContractionToContractionPass());
   passManager.addPass(DispatchCreation::createFusionPreprocessingPass());
   passManager.addPass(mlir::createCSEPass());
   DispatchCreation::BubbleUpExpandShapesPassOptions bubbleOptions;

0 commit comments
