Commit 2b30f25

[GlobalOpt] Fix transpose propagation for index-semantic ops by interchanging indexing maps (#22248)
Index-semantic ops were previously treated as elementwise in the `SinkTransposeThroughUnaryElementwiseInput` and `BubbleTransposeThroughUnaryElementwiseDpsInit` patterns, which could not correctly update the indexing maps. After this change, ops with index semantics are no longer handled by these patterns; instead, they are processed by the `FuseTransposeWithProducerLinalgOp` pattern, which uses `linalg::interchangeGenericOp`. That function already handles index-semantic ops.

---------

Signed-off-by: Ziliang Zhang <[email protected]>
1 parent fcae3fc commit 2b30f25
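For context, the commit message points at `linalg::interchangeGenericOp` as the mechanism that makes `FuseTransposeWithProducerLinalgOp` safe for index-semantic ops: it permutes the whole iteration space (indexing maps, iterator types, and the body's `linalg.index` ops together) rather than swapping indexing maps alone. The sketch below is a minimal, hypothetical illustration of calling that utility from a rewrite; the helper name `interchangeProducerForTranspose` and its signature are assumptions, not the actual pattern code.

```cpp
// Minimal sketch, not the actual FuseTransposeWithProducerLinalgOp code:
// fold a consumer transpose's permutation into a producer linalg.generic by
// interchanging the producer's iteration space.
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

// `permutation` is assumed to already be expressed over the generic op's loop
// dimensions (deriving it from the linalg.transpose op is omitted here).
static FailureOr<linalg::GenericOp>
interchangeProducerForTranspose(RewriterBase &rewriter,
                                linalg::GenericOp producer,
                                ArrayRef<unsigned> permutation) {
  // linalg::interchangeGenericOp rewrites the indexing maps, iterator types,
  // and the linalg.index ops in the body consistently, which is why (per the
  // commit message) it handles ops with index semantics correctly.
  return linalg::interchangeGenericOp(rewriter, producer, permutation);
}
```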

2 files changed: +51 additions, -3 deletions

compiler/src/iree/compiler/GlobalOptimization/PropagateLinalgTranspose.cpp

Lines changed: 11 additions & 3 deletions
@@ -761,6 +761,10 @@ class SinkTransposeThroughUnaryElementwiseInput
       return rewriter.notifyMatchFailure(genericOp, "non-elementwise generic");
     }
 
+    if (genericOp.hasIndexSemantics()) {
+      return rewriter.notifyMatchFailure(genericOp, "has index semantics");
+    }
+
     if (genericOp.getNumDpsInits() != 1) {
       return rewriter.notifyMatchFailure(genericOp,
                                          "unimplemented: multiple results");
@@ -865,6 +869,10 @@ class BubbleTransposeThroughUnaryElementwiseDpsInit
       return rewriter.notifyMatchFailure(transposeOp, "not elementwise");
     }
 
+    if (genericOp.hasIndexSemantics()) {
+      return rewriter.notifyMatchFailure(genericOp, "has index semantics");
+    }
+
     if (!genericOp->hasOneUse()) {
       return rewriter.notifyMatchFailure(transposeOp, "not single user");
     }
@@ -898,9 +906,9 @@ class BubbleTransposeThroughUnaryElementwiseDpsInit
     SmallVector<AffineMap> indexingMaps = getTransposedIndexingMaps(
         genericOp, inputOperand->getOperandNumber(), transposeMap);
 
-    // We do not need to update indexing maps because this is a unary
-    // elementwise op where the input and output maps are the same. Just
-    // replace the operands with transposed variants.
+    // We do not need to update indexing maps because this is an elementwise
+    // op where the input and output maps are the same.
+    // Just replace the operands with transposed variants.
     auto newGenericOp =
         mlir::clone(rewriter, genericOp, newInit.getType(), newOperands);
     newGenericOp.setIndexingMapsAttr(

compiler/src/iree/compiler/GlobalOptimization/test/propagate_linalg_transpose.mlir

Lines changed: 40 additions & 0 deletions
@@ -254,6 +254,46 @@ util.func public @do_not_propagate_to_matmul_in_dispatch(%lhs: tensor<16x16xf32>
 
 // -----
 
+util.func public @propagate_to_gather_like_ops(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<1xi16>) -> tensor<2x3x4x5xf32> {
+  %cst = arith.constant 0xFF800000 : f32
+  %empty_transposed = tensor.empty() : tensor<2x4x5x3xf32>
+  %transposed = linalg.transpose ins(%arg0 : tensor<2x3x4x5xf32>) outs(%empty_transposed : tensor<2x4x5x3xf32>) permutation = [0, 2, 3, 1]
+  %empty = tensor.empty() : tensor<2x4x5x3xf32>
+  %collapsed = tensor.collapse_shape %arg1 [] : tensor<1xi16> into tensor<i16>
+  %mask = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed, %collapsed : tensor<2x4x5x3xf32>, tensor<i16>) outs(%empty : tensor<2x4x5x3xf32>) {
+  ^bb0(%in: f32, %in_0: i16, %out: f32):
+    %11 = linalg.index 3 : index
+    %12 = arith.index_cast %in_0 : i16 to index
+    %13 = arith.cmpi ult, %11, %12 : index
+    %14 = arith.select %13, %in, %cst : f32
+    linalg.yield %14 : f32
+  } -> tensor<2x4x5x3xf32>
+  %empty_transposed_0 = tensor.empty() : tensor<2x3x4x5xf32>
+  %transposed_0 = linalg.transpose ins(%mask : tensor<2x4x5x3xf32>) outs(%empty_transposed_0 : tensor<2x3x4x5xf32>) permutation = [0, 3, 1, 2]
+  util.return %transposed_0 : tensor<2x3x4x5xf32>
+}
+
+// CHECK-DAG: #[[$MAP_0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+// CHECK-DAG: #[[$MAP_1:.+]] = affine_map<(d0, d1, d2, d3) -> ()>
+// CHECK-LABEL: util.func public @propagate_to_gather_like_ops(
+// CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x4x5xf32>,
+// CHECK-SAME: %[[ARG1:.*]]: tensor<1xi16>) -> tensor<2x3x4x5xf32> {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0xFF800000 : f32
+// CHECK: %[[VAL_1:.*]] = tensor.collapse_shape %[[ARG1]] [] : tensor<1xi16> into tensor<i16>
+// CHECK: %[[VAL_2:.*]] = tensor.empty() : tensor<2x3x4x5xf32>
+// CHECK: %[[VAL_3:.*]] = linalg.generic {indexing_maps = [#[[$MAP_0]], #[[$MAP_1]], #[[$MAP_0]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%[[ARG0]], %[[VAL_1]] : tensor<2x3x4x5xf32>, tensor<i16>) outs(%[[VAL_2]] : tensor<2x3x4x5xf32>) {
+// CHECK: ^bb0(%[[VAL_4:.*]]: f32, %[[VAL_5:.*]]: i16, %[[VAL_6:.*]]: f32):
+// CHECK: %[[VAL_7:.*]] = linalg.index 1 : index
+// CHECK: %[[VAL_8:.*]] = arith.index_cast %[[VAL_5]] : i16 to index
+// CHECK: %[[VAL_9:.*]] = arith.cmpi ult, %[[VAL_7]], %[[VAL_8]] : index
+// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_9]], %[[VAL_4]], %[[VAL_0]] : f32
+// CHECK: linalg.yield %[[VAL_10]] : f32
+// CHECK: } -> tensor<2x3x4x5xf32>
+// CHECK: util.return %[[VAL_3]] : tensor<2x3x4x5xf32>
+// CHECK: }
+
+// -----
+
 util.func public @propagate_to_bmm_transpose_batch(%transposed_lhs: tensor<16x2x16xf32>,
                                                    %rhs: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %empty = tensor.empty(): tensor<2x16x16xf32>
