@@ -27,7 +27,6 @@ util.func public @transpose_attention(%arg0: tensor<4x64x32x128xf16>, %arg1: ten
   %collapsed = tensor.collapse_shape %7 [[0], [1], [2, 3]] : tensor<4x64x32x128xf16> into tensor<4x64x4096xf16>
   util.return %collapsed : tensor<4x64x4096xf16>
 }
-
 // CHECK-LABEL: util.func public @transpose_attention
 // CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor
 // CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor
@@ -76,7 +75,6 @@ util.func public @transposed_attention_masked(%arg0: tensor<4x64x32x128xf16>, %a
   %collapsed = tensor.collapse_shape %8 [[0], [1], [2, 3]] : tensor<4x64x32x128xf16> into tensor<4x64x4096xf16>
   util.return %collapsed : tensor<4x64x4096xf16>
 }
-
 // CHECK-LABEL: util.func public @transposed_attention_masked
 // CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor
 // CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor
@@ -115,7 +113,6 @@ util.func public @transpose_matmul(%arg0 : tensor<100x100xf16>, %arg1 : tensor<1
   } -> tensor<100x100xf16>
   util.return %4 : tensor<100x100xf16>
 }
-
 // CHECK-LABEL: util.func public @transpose_matmul
 // CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor
 // CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor
@@ -156,7 +153,6 @@ util.func public @fuse_generic_gather(
   } -> tensor<4x?x4096xf32>
   util.return %16 : tensor<4x?x4096xf32>
 }
-
 // CHECK: %[[INDEX0:[a-zA-Z0-9]+]] = arith.index_cast %in : i64 to index
 // CHECK: %[[INDEX1:[a-zA-Z0-9]+]] = linalg.index 2 : index
 // CHECK-NEXT: %[[EXTRACTED:.*]] = tensor.extract %[[TENSOR0:.+]][%[[INDEX0]], %[[INDEX1]]] : tensor<128256x4096xf16>
@@ -198,7 +194,6 @@ util.func public @fuse_generic_gather2(
   } -> tensor<4x?x4096xf32>
   util.return %16 : tensor<4x?x4096xf32>
 }
-
 // CHECK: %[[INDEX0:[a-zA-Z0-9]+]] = arith.index_cast %in : i64 to index
 // CHECK: %[[INDEX1:[a-zA-Z0-9]+]] = linalg.index 2 : index
 // CHECK-NEXT: %[[EXTRACTED:.*]] = tensor.extract %[[TENSOR0:.+]][%[[INDEX0]], %[[INDEX1]]] : tensor<128256x4096xf16>
@@ -237,7 +232,6 @@ util.func public @fuse_transpose_attention_to_producer(%q: tensor<2x10x4096x64xf
   } -> tensor<2x10x4096x64xf16>
   util.return %attention : tensor<2x10x4096x64xf16>
 }
-
 // CHECK-LABEL: util.func public @fuse_transpose_attention_to_producer
 // CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor
 // CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor
@@ -274,7 +268,6 @@ util.func public @fuse_attention_with_broadcast(%arg0: tensor<4x8x128x?xf16>, %a
   } -> tensor<4x8x4x?x32x128xf16>
   util.return %1 : tensor<4x8x4x?x32x128xf16>
 }
-
 // CHECK-LABEL: func public @fuse_attention_with_broadcast
 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]:
 // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]:
@@ -305,7 +298,6 @@ util.func public @fuse_attention_with_broadcast_transpose(%arg0: tensor<4x?x8x12
   } -> tensor<4x8x4x?x32x128xf16>
   util.return %1 : tensor<4x8x4x?x32x128xf16>
 }
-
 // CHECK-LABEL: func public @fuse_attention_with_broadcast_transpose
 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]:
 // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]:
@@ -356,7 +348,6 @@ util.func public @gather_fusion(%arg0: tensor<2x64x64x640xf16>, %arg1: tensor<2x
   } -> tensor<2x128x128x640xi8>
   util.return %3 : tensor<2x128x128x640xi8>
 }
-
 // CHECK-LABEL: util.func public @gather_fusion(
 // CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor
 // CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor
@@ -423,7 +414,6 @@ util.func public @gather_fusion_compose_maps(%arg0: tensor<2x64x64x640xf16>, %ar
   } -> tensor<2x128x128x640xi8>
   util.return %3 : tensor<2x128x128x640xi8>
 }
-
 // CHECK-LABEL: util.func public @gather_fusion_compose_maps(
 // CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor
 // CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor
@@ -455,3 +445,94 @@ util.func public @gather_fusion_compose_maps(%arg0: tensor<2x64x64x640xf16>, %ar
 // CHECK: %[[EXTRACT1:.*]] = tensor.extract %[[ARG1]][%[[CAST0]], %[[CAST3]], %[[CAST2]], %[[CAST1]]] : tensor<2x64x64x640xf16>
 // CHECK: %[[ADDF:.+]] = arith.addf %[[EXTRACT0]], %[[EXTRACT1]] : f16
 // CHECK: util.return %[[GEN]] : tensor<2x128x128x640xi8>
+
+// -----
+
+util.func public @gather_0d_producer(%arg0 : tensor<f16>, %arg1 : tensor<100xindex>, %arg2 : tensor<256xf16>) -> (tensor<100xf32>) {
+  %empty0 = tensor.empty() : tensor<256xf32>
+  %0 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg2 : tensor<f16>, tensor<256xf16>) outs(%empty0 : tensor<256xf32>) {
+  ^bb0(%in: f16, %in0: f16, %out: f32):
+    %0 = arith.extf %in : f16 to f32
+    %1 = arith.extf %in0 : f16 to f32
+    %2 = arith.addf %0, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<256xf32>
+  %empty1 = tensor.empty() : tensor<100xf32>
+  %gather = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg1: tensor<100xindex>) outs(%empty1 : tensor<100xf32>) {
+  ^bb0(%in: index, %out: f32):
+    %1 = tensor.extract %0[%in] : tensor<256xf32>
+    linalg.yield %1 : f32
+  } -> tensor<100xf32>
+  util.return %gather : tensor<100xf32>
+}
+// CHECK-LABEL: util.func public @gather_0d_producer(
+// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor
+// CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor
+// CHECK-SAME: %[[ARG2:[A-Za-z0-9]+]]: tensor
+// CHECK: %[[GATHER:.+]] = linalg.generic
+// CHECK-SAME: ins(%[[ARG1]] : tensor<100xindex>
+// CHECK-NEXT: ^bb0(%[[IN:.+]]: index
+// CHECK-DAG: %[[EXTRACT0:.+]] = tensor.extract %[[ARG0]][]
+// CHECK-DAG: %[[EXTRACT1:.+]] = tensor.extract %[[ARG2]][%[[IN]]]
+// CHECK: return %[[GATHER]]
+
+// -----
+
+util.func public @gather_replace_linalg_index(%arg0 : tensor<256x256xf16>, %arg1 : tensor<100xindex>) -> (tensor<100xf32>) {
+  %empty0 = tensor.empty() : tensor<256x256xf32>
+  %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<256x256xf16>) outs(%empty0 : tensor<256x256xf32>) {
+  ^bb0(%in: f16, %out: f32):
+    %0 = arith.extf %in : f16 to f32
+    %1 = linalg.index 1 : index
+    %2 = arith.index_cast %1 : index to i32
+    %3 = arith.uitofp %2 : i32 to f32
+    %4 = arith.addf %0, %3 : f32
+    linalg.yield %4 : f32
+  } -> tensor<256x256xf32>
+  %empty1 = tensor.empty() : tensor<100xf32>
+  %gather = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg1: tensor<100xindex>) outs(%empty1 : tensor<100xf32>) {
+  ^bb0(%in: index, %out: f32):
+    %cst0 = arith.constant 0 : index
+    %1 = tensor.extract %0[%cst0, %in] : tensor<256x256xf32>
+    linalg.yield %1 : f32
+  } -> tensor<100xf32>
+  util.return %gather : tensor<100xf32>
+}
+// CHECK-LABEL: util.func public @gather_replace_linalg_index(
+// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor
+// CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor
+// CHECK: %[[GATHER:.+]] = linalg.generic
+// CHECK-SAME: ins(%[[ARG1]] : tensor<100xindex>
+// CHECK-NEXT: ^bb0(%[[IN:.+]]: index
+// CHECK: arith.index_cast %[[IN]]
+// CHECK: return %[[GATHER]]
+
+// -----
+
+util.func public @gather_replace_linalg_index_transpose(%arg0 : tensor<256x256xf16>, %arg1 : tensor<100xindex>, %arg2 : index) -> (tensor<100xf32>) {
+  %empty0 = tensor.empty() : tensor<256x256xf32>
+  %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1, d0)>], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<256x256xf16>) outs(%empty0 : tensor<256x256xf32>) {
+  ^bb0(%in: f16, %out: f32):
+    %0 = arith.extf %in : f16 to f32
+    %1 = linalg.index 1 : index
+    %2 = arith.index_cast %1 : index to i32
+    %3 = arith.uitofp %2 : i32 to f32
+    %4 = arith.addf %0, %3 : f32
+    linalg.yield %4 : f32
+  } -> tensor<256x256xf32>
+  %empty1 = tensor.empty() : tensor<100xf32>
+  %gather = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg1: tensor<100xindex>) outs(%empty1 : tensor<100xf32>) {
+  ^bb0(%in: index, %out: f32):
+    %1 = tensor.extract %0[%arg2, %in] : tensor<256x256xf32>
+    linalg.yield %1 : f32
+  } -> tensor<100xf32>
+  util.return %gather : tensor<100xf32>
+}
+// CHECK-LABEL: util.func public @gather_replace_linalg_index_transpose(
+// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor
+// CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: tensor
+// CHECK-SAME: %[[ARG2:[A-Za-z0-9]+]]: index
+// CHECK: %[[GATHER:.+]] = linalg.generic
+// CHECK-SAME: ins(%[[ARG1]] : tensor<100xindex>
+// CHECK: arith.index_cast %[[ARG2]]
+// CHECK: return %[[GATHER]]