[GPU] Set insertion point to last slice index operand in reshape and slice fusion (#19959)

Max191 · web-flow · commit d3cfe11c4a16 · 2025-02-12T17:45:29.000Z
Empty tensor elimination relies on dominance of SSA values when
attempting to reuse buffers for slices of init operands. Ideally, empty
tensor elimination should be able to handle slice index values that do
not satisfy this dominance requirement, but that is difficult to fix at
that level. For now, this PR tries to avoid creating these dominance
issues in the first place by setting the insertion point for new index
computation to the last slice index operand.

---------

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/transform_fuse_collapse_shape_with_forall.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/transform_fuse_collapse_shape_with_forall.mlir
@@ -98,13 +98,13 @@ module attributes {transform.with_named_sequence} {
 //  CHECK-SAME:     {{\[}}[0], [1, 2]] output_shape [%[[SIZE0]], %[[SIZE1]], 8] : tensor<?x?xf32> into tensor<?x?x8xf32>
 //   CHECK-DAG:   %[[SLICE_SIZE_0:.+]] = affine.min #map(%[[IDX0]])[%[[SIZE0]]]
 //   CHECK-DAG:   %[[SLICE_SIZE_1:.+]] = affine.min #map(%[[IDX1]])[%[[SIZE1]]]
+//   CHECK-DAG:   %[[LINEAR_SLICE_IDX:.+]] = affine.linearize_index disjoint [%[[IDX1]], %[[C0]]] by (%[[SIZE1]], 8) : index
+//   CHECK-DAG:   %[[COLLAPSED_SLICE_SIZE:.+]] = affine.apply #[[$MAP1]](%[[SLICE_SIZE_1]])
 //   CHECK-DAG:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]]
 //  CHECK-SAME:     [%[[IDX0]], %[[IDX1]], 0]{{.*}}[%[[SLICE_SIZE_0]], %[[SLICE_SIZE_1]], 8] [1, 1, 1] : tensor<?x?x8xf32> to tensor<?x?x8xf32>
 //   CHECK-DAG:   %[[OUT_SLICE:.+]] = tensor.extract_slice %[[EXPANDED_BBARG]]
 //  CHECK-SAME:     [%[[IDX0]], %[[IDX1]], 0] [%[[SLICE_SIZE_0]], %[[SLICE_SIZE_1]], 8] [1, 1, 1] : tensor<?x?x8xf32> to tensor<?x?x8xf32>
 //   CHECK-DAG:   %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<?x?x8xf32>) outs(%[[OUT_SLICE]] : tensor<?x?x8xf32>) -> tensor<?x?x8xf32>
-//   CHECK-DAG:   %[[LINEAR_SLICE_IDX:.+]] = affine.linearize_index disjoint [%[[IDX1]], %[[C0]]] by (%[[SIZE1]], 8) : index
-//   CHECK-DAG:   %[[COLLAPSED_SLICE_SIZE:.+]] = affine.apply #[[$MAP1]](%[[SLICE_SIZE_1]])
 //   CHECK-DAG:   %[[COLLAPSED_COPY:.+]] = tensor.collapse_shape %[[COPY]] {{\[}}[0], [1, 2]] : tensor<?x?x8xf32> into tensor<?x?xf32>
 //       CHECK:     scf.forall.in_parallel {
 //       CHECK:       tensor.parallel_insert_slice %[[COLLAPSED_COPY]] into %[[COLLAPSED_BBARG]]
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp
@@ -354,6 +354,43 @@ collapsibleSlicePrecondition(RewriterBase &rewriter,
   return success();
 }
 
+/// Given a tensor.parallel_insert_slice op, find all values that are needed to
+/// build an equivalent subset extract_slice, and set the insertion point right
+/// after the last of them, so that index computation composed with the slice
+/// keeps the same dominance level. A block argument is anchored on the first
+/// op of its block, with the insertion point before that op. Returns the op
+/// the insertion point was anchored on; asserts that a needed value exists.
+static Operation *
+setInsertionPointAfterLastIndexOperand(RewriterBase &rewriter,
+                                       tensor::ParallelInsertSliceOp op) {
+  DominanceInfo domInfo;
+  auto subsetOp = cast<SubsetInsertionOpInterface>(op.getOperation());
+  SmallVector<Value> values = subsetOp.getValuesNeededToBuildSubsetExtraction();
+  Operation *lastOp = nullptr;
+  bool setInsertionPointBefore = false;
+  for (Value val : values) {
+    auto blockArg = dyn_cast<BlockArgument>(val);
+    Operation *anchorOp =
+        blockArg ? &blockArg.getOwner()->front() : val.getDefiningOp();
+    if (anchorOp == lastOp) {
+      // A result of the anchor itself is only usable after it, even if a block
+      // argument seen earlier asked for the insertion point before it.
+      setInsertionPointBefore = setInsertionPointBefore && blockArg;
+      continue;
+    }
+    if (lastOp && domInfo.dominates(anchorOp, lastOp))
+      continue;
+    lastOp = anchorOp;
+    setInsertionPointBefore = static_cast<bool>(blockArg);
+  }
+  assert(lastOp && "expected at least one value needed to build the slice");
+  if (setInsertionPointBefore)
+    rewriter.setInsertionPoint(lastOp);
+  else
+    rewriter.setInsertionPointAfter(lastOp);
+  return lastOp;
+}
+
 /// Collapse all `ops` with the given `reassociations`. All `ops` are expected
 /// to have equivalent offsets, sizes, and strides. All strides are expected to
 /// be 1. This function assumes that the parallelInsertOp passes the
@@ -363,8 +400,9 @@ collapseParallelInsertOp(RewriterBase &rewriter,
                          tensor::ParallelInsertSliceOp parallelInsertOp,
                          SmallVector<ReassociationIndices> reassociations) {
   // Compute the collapsed offsets, sizes, and strides.
-  rewriter.setInsertionPoint(parallelInsertOp.getParallelCombiningParent());
-  Location loc = parallelInsertOp->getLoc();
+  Operation *lastOp =
+      setInsertionPointAfterLastIndexOperand(rewriter, parallelInsertOp);
+  Location loc = lastOp->getLoc();
   int64_t resultIdx = parallelInsertOp.getTiedOpResult().getResultNumber();
   auto forallOp = parallelInsertOp->getParentOfType<scf::ForallOp>();
   Value loopInit = forallOp.getOutputs()[resultIdx];
@@ -555,8 +593,9 @@ clampParallelInsertSliceOp(RewriterBase &rewriter,
                            tensor::ParallelInsertSliceOp parallelInsertOp,
                            SmallVector<OpFoldResult> upperBoundSizes) {
   OpBuilder::InsertionGuard g(rewriter);
-  rewriter.setInsertionPoint(parallelInsertOp.getParallelCombiningParent());
-  Location loc = parallelInsertOp.getParallelCombiningParent()->getLoc();
+  Operation *lastOp =
+      setInsertionPointAfterLastIndexOperand(rewriter, parallelInsertOp);
+  Location loc = lastOp->getLoc();
 
   // Clamp the parallel_insert_slice sizes to fit within the full result tensor.
   SmallVector<OpFoldResult> offsets = parallelInsertOp.getMixedOffsets();