
Commit a7bab8c

qedawkins authored and keshavvinayak01 committed
[Codegen] Add pattern to bubble bitcast past extract_slice (iree-org#21518)
It's easiest to handle bitcasts if we're able to fold them into input bindings. Since we'll want an analogous pattern to "fuse" bitcasts when they aren't foldable (and do them at the vector level), this is only run when propagating reshapes by expansion earlier on.

Signed-off-by: keshavvinayak01 <[email protected]>
1 parent 3f37e32 commit a7bab8c
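
In IR terms, the new pattern hoists a bitcast above the tensor.extract_slice that produces its operand, so the bitcast lands directly on the original tensor. A minimal before/after sketch, distilled from the first test case in this commit (only the innermost dimension is rebitcast: 6 x i8 = 48 bits = 3 x i16; %src stands in for the slice's source value):

// Before: the bitcast consumes a slice of the source.
%0 = tensor.extract_slice %src [0, 0] [2, 6] [1, 1] : tensor<3x6xi8> to tensor<2x6xi8>
%1 = iree_tensor_ext.bitcast %0 : tensor<2x6xi8> -> tensor<2x3xi16>

// After: the bitcast applies to the whole source, and the slice's innermost
// static size is rewritten to the new element count (6 -> 3).
%0 = iree_tensor_ext.bitcast %src : tensor<3x6xi8> -> tensor<3x3xi16>
%1 = tensor.extract_slice %0 [0, 0] [2, 3] [1, 1] : tensor<3x3xi16> to tensor<2x3xi16>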

2 files changed: +160 -1

compiler/src/iree/compiler/Codegen/Common/PropagateReshapesByExpansion.cpp

Lines changed: 76 additions & 1 deletion
@@ -11,6 +11,9 @@
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 namespace mlir::iree_compiler {
@@ -294,6 +297,76 @@ struct ExpandDestinationForallOp final
   }
 };
 
+/// This pattern rewrites bitcast(extract_slice) into extract_slice(bitcast)
+/// in an attempt to move the bitcast closer to the loads. There is a related
+/// pattern that does the reverse when folding the bitcast is not possible and
+/// should be applied later.
+struct SwapInnerBitcastWithExtractSlice
+    : OpRewritePattern<IREE::TensorExt::BitCastOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(IREE::TensorExt::BitCastOp bitcastOp,
+                                PatternRewriter &rewriter) const override {
+    Value bitcastSrc = bitcastOp.getSource();
+    auto sliceOp = bitcastSrc.getDefiningOp<tensor::ExtractSliceOp>();
+    if (!sliceOp) {
+      return rewriter.notifyMatchFailure(bitcastOp, "non-slice producer");
+    }
+
+    auto bitcastSrcType = cast<RankedTensorType>(bitcastSrc.getType());
+    auto bitcastResType = cast<RankedTensorType>(bitcastOp.getType());
+
+    // Verify that only the innermost dimension is changed by the bitcast by
+    // comparing dynamic and static sizes for equality.
+    if (bitcastOp.getSourceDims() != bitcastOp.getResultDims() ||
+        bitcastSrcType.getShape().drop_back() !=
+            bitcastResType.getShape().drop_back() ||
+        ShapedType::isDynamic(bitcastSrcType.getShape().back())) {
+      return rewriter.notifyMatchFailure(
+          bitcastOp, "bitcast affects more than inner most dim");
+    }
+
+    // Fail if the innermost dim is sliced or if this is an encoded tensor.
+    RankedTensorType sliceInputType = sliceOp.getSource().getType();
+    if (sliceInputType.getEncoding() ||
+        sliceInputType.getRank() != bitcastSrcType.getRank() ||
+        sliceInputType.getShape().back() != bitcastSrcType.getShape().back()) {
+      return rewriter.notifyMatchFailure(
+          bitcastOp,
+          "inner dimension is sliced or rank reducing or tensor is encoded");
+    }
+
+    int64_t newInnerSize = bitcastResType.getShape().back();
+    SmallVector<int64_t> newBitcastShape(sliceInputType.getShape());
+    newBitcastShape.back() = newInnerSize;
+
+    auto newBitcastType =
+        RankedTensorType::get(newBitcastShape, bitcastResType.getElementType());
+
+    // Get the dynamic sizes of the slice source. Extracting a slice can remove
+    // dynamic dimensions or introduce new ones, so a new list of sizes is
+    // needed.
+    SmallVector<OpFoldResult> newMixedSizes =
+        tensor::getMixedSizes(rewriter, sliceOp.getLoc(), sliceOp.getSource());
+    SmallVector<Value> sliceSourceDynamicSizes;
+    SmallVector<int64_t> sliceSourceStaticSizes;
+    dispatchIndexOpFoldResults(newMixedSizes, sliceSourceDynamicSizes,
+                               sliceSourceStaticSizes);
+
+    Value newBitcast = rewriter.create<IREE::TensorExt::BitCastOp>(
+        bitcastOp.getLoc(), newBitcastType, sliceOp.getSource(),
+        sliceSourceDynamicSizes, sliceSourceDynamicSizes);
+    SmallVector<int64_t> newSizes(sliceOp.getStaticSizes());
+    newSizes.back() = newInnerSize;
+    rewriter.replaceOpWithNewOp<tensor::ExtractSliceOp>(
+        bitcastOp, bitcastResType, newBitcast, sliceOp.getOffsets(),
+        sliceOp.getSizes(), sliceOp.getStrides(), sliceOp.getStaticOffsets(),
+        newSizes, sliceOp.getStaticStrides());
+
+    return success();
+  }
+};
+
 struct PropagateReshapesByExpansionPass final
     : impl::PropagateReshapesByExpansionPassBase<
          PropagateReshapesByExpansionPass> {
@@ -341,7 +414,9 @@ void PropagateReshapesByExpansionPass::runOnOperation() {
   tensor::ExpandShapeOp::getCanonicalizationPatterns(bubbleExpandShapePatterns,
                                                      context);
   populateReshapeToInterfaceTensorPatterns(bubbleExpandShapePatterns);
-  bubbleExpandShapePatterns.add<ExpandDestinationForallOp>(context);
+  bubbleExpandShapePatterns
+      .add<ExpandDestinationForallOp, SwapInnerBitcastWithExtractSlice>(
+          context);
 
   if (failed(applyPatternsGreedily(getOperation(),
                                    std::move(bubbleExpandShapePatterns)))) {
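
A note on the dynamic-shape handling above: the dynamic sizes for the new bitcast are recomputed from the slice source with tensor::getMixedSizes rather than reused from the old bitcast, because slicing can drop dynamic dimensions or introduce new ones. A sketch of what the rewrite emits for a dynamic outer dimension, mirroring the @swap_inner_bitcast_dynamic_source test below (%src again stands in for the slice source):

%c0 = arith.constant 0 : index
// getMixedSizes materializes the dynamic extent of the source...
%dim = tensor.dim %src, %c0 : tensor<?x6xi8>
// ...which feeds both the source and result dims of the new bitcast, since
// only the static innermost dimension changes.
%0 = iree_tensor_ext.bitcast %src : tensor<?x6xi8>{%dim} -> tensor<?x3xi16>{%dim}
%1 = tensor.extract_slice %0 [0, 0] [2, 3] [1, 1] : tensor<?x3xi16> to tensor<2x3xi16>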

compiler/src/iree/compiler/Codegen/Common/test/propagate_reshapes_by_expansion.mlir

Lines changed: 84 additions & 0 deletions
@@ -341,3 +341,87 @@ func.func @expand_dest_forall_no_crash_issue_20736(%arg0: tensor<16x8x48x32x3x96
 // CHECK: scf.forall
 // CHECK-NOT: tensor.collapse_shape
 // CHECK: tensor.parallel_insert_slice
+
+// -----
+
+func.func @swap_inner_bitcast(%arg0: tensor<3x6xi8>) -> tensor<2x3xi16> {
+  %0 = tensor.extract_slice %arg0 [0, 0] [2, 6] [1, 1] : tensor<3x6xi8> to tensor<2x6xi8>
+  %1 = iree_tensor_ext.bitcast %0 : tensor<2x6xi8> -> tensor<2x3xi16>
+  return %1 : tensor<2x3xi16>
+}
+
+// CHECK-LABEL: @swap_inner_bitcast
+// CHECK-SAME:  %[[ARG0:[A-Za-z0-9]+]]: tensor<3x6xi8>
+// CHECK-NEXT:  %[[BITCAST:.+]] = iree_tensor_ext.bitcast %[[ARG0]] : tensor<3x6xi8> -> tensor<3x3xi16>
+// CHECK-NEXT:  %[[SLICE:.+]] = tensor.extract_slice %[[BITCAST]]{{.*}} : tensor<3x3xi16> to tensor<2x3xi16>
+// CHECK-NEXT:  return %[[SLICE]]
+
+// -----
+
+func.func @no_swap_arbitrary_bitcast(%arg0: tensor<3x6xi8>) -> tensor<6xi16> {
+  %0 = tensor.extract_slice %arg0 [0, 0] [2, 6] [1, 1] : tensor<3x6xi8> to tensor<2x6xi8>
+  %1 = iree_tensor_ext.bitcast %0 : tensor<2x6xi8> -> tensor<6xi16>
+  return %1 : tensor<6xi16>
+}
+
+// CHECK-LABEL: @no_swap_arbitrary_bitcast
+// CHECK-SAME:  %[[ARG0:[A-Za-z0-9]+]]: tensor<3x6xi8>
+// CHECK-NEXT:  %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-NEXT:  %[[BITCAST:.+]] = iree_tensor_ext.bitcast %[[SLICE]]
+// CHECK-NEXT:  return %[[BITCAST]]
+
+// -----
+
+func.func @swap_inner_bitcast_dynamic_source(%arg0: tensor<?x6xi8>) -> tensor<2x3xi16> {
+  %0 = tensor.extract_slice %arg0 [0, 0] [2, 6] [1, 1] : tensor<?x6xi8> to tensor<2x6xi8>
+  %1 = iree_tensor_ext.bitcast %0 : tensor<2x6xi8> -> tensor<2x3xi16>
+  return %1 : tensor<2x3xi16>
+}
+
+// CHECK-LABEL: @swap_inner_bitcast_dynamic_source
+// CHECK-SAME:  %[[ARG0:[A-Za-z0-9]+]]: tensor<?x6xi8>
+// CHECK:       %[[DIM:.+]] = tensor.dim %[[ARG0]], %c0 : tensor<?x6xi8>
+// CHECK-NEXT:  %[[BITCAST:.+]] = iree_tensor_ext.bitcast %[[ARG0]] : tensor<?x6xi8>{%[[DIM]]} -> tensor<?x3xi16>{%[[DIM]]}
+// CHECK-NEXT:  %[[SLICE:.+]] = tensor.extract_slice %[[BITCAST]]{{.*}} : tensor<?x3xi16> to tensor<2x3xi16>
+// CHECK-NEXT:  return %[[SLICE]]
+
+// -----
+
+func.func @swap_inner_bitcast_dynamic_result(%arg0: tensor<3x6xi8>, %arg1: index) -> tensor<?x3xi16> {
+  %0 = tensor.extract_slice %arg0 [0, 0] [%arg1, 6] [1, 1] : tensor<3x6xi8> to tensor<?x6xi8>
+  %1 = iree_tensor_ext.bitcast %0 : tensor<?x6xi8>{%arg1} -> tensor<?x3xi16>{%arg1}
+  return %1 : tensor<?x3xi16>
+}
+
+// CHECK-LABEL: @swap_inner_bitcast_dynamic_result
+// CHECK-SAME:  %[[ARG0:[A-Za-z0-9]+]]: tensor<3x6xi8>
+// CHECK-SAME:  %[[ARG1:[A-Za-z0-9]+]]: index
+// CHECK-NEXT:  %[[BITCAST:.+]] = iree_tensor_ext.bitcast %[[ARG0]] : tensor<3x6xi8> -> tensor<3x3xi16>
+// CHECK-NEXT:  %[[SLICE:.+]] = tensor.extract_slice %[[BITCAST]]{{.*}} : tensor<3x3xi16> to tensor<?x3xi16>
+// CHECK-NEXT:  return %[[SLICE]]
+
+// -----
+
+func.func @no_swap_encoded_bitcast(%arg0: tensor<3x6xi8, 1>) -> tensor<2x3xi16, 1> {
+  %0 = tensor.extract_slice %arg0 [0, 0] [2, 6] [1, 1] : tensor<3x6xi8, 1> to tensor<2x6xi8, 1>
+  %1 = iree_tensor_ext.bitcast %0 : tensor<2x6xi8, 1> -> tensor<2x3xi16, 1>
+  return %1 : tensor<2x3xi16, 1>
+}
+
+// CHECK-LABEL: @no_swap_encoded_bitcast
+// CHECK-SAME:  %[[ARG0:[A-Za-z0-9]+]]: tensor<3x6xi8, 1 : i64>
+// CHECK-NEXT:  %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-NEXT:  iree_tensor_ext.bitcast %[[SLICE]]
+
+// -----
+
+func.func @no_swap_rank_reducing_slice(%arg0: tensor<3x6xi8>) -> tensor<3xi16> {
+  %0 = tensor.extract_slice %arg0 [0, 0] [1, 6] [1, 1] : tensor<3x6xi8> to tensor<6xi8>
+  %1 = iree_tensor_ext.bitcast %0 : tensor<6xi8> -> tensor<3xi16>
+  return %1 : tensor<3xi16>
+}
+
+// CHECK-LABEL: @no_swap_rank_reducing_slice
+// CHECK-SAME:  %[[ARG0:[A-Za-z0-9]+]]: tensor<3x6xi8>
+// CHECK-NEXT:  %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-NEXT:  iree_tensor_ext.bitcast %[[SLICE]]
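
The // ----- separators split this file into independent inputs, so each case above is verified in isolation. The file's RUN line sits at the top of the file, outside this diff's context; assuming it follows IREE's usual pass-flag naming for PropagateReshapesByExpansionPass (an assumption, not confirmed by this diff), it would look roughly like:

// RUN: iree-opt --split-input-file --iree-codegen-propagate-reshapes-by-expansion %s | FileCheck %s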
