Skip to content

Commit 27735ad

Browse files
[mlir][Codegen] Remove workaround for handling consumer fusion along multiple operands.
With llvm/llvm-project#145193 it is possible to tile and fuse consumers when the consumer uses multiple results of the tiled loop (as long as the slices of the uses/operands are consistent w.r.t. their use in the consumer). This removes the need for the workaround that was added to handle such cases and generalizes the cases of consumer fusion that can be handled. Fixes iree-org#21087 Signed-off-by: MaheshRavishankar <[email protected]>
1 parent 0ec7c6d commit 27735ad

File tree

4 files changed

+157
-141
lines changed

4 files changed

+157
-141
lines changed

compiler/src/iree/compiler/Codegen/Common/TileAndFuseUtils.cpp

Lines changed: 56 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include "iree/compiler/Codegen/Common/TileAndFuseUtils.h"
88
#include "llvm/Support/Debug.h"
9+
#include "mlir/Analysis/TopologicalSortUtils.h"
910
#include "mlir/Dialect/Linalg/IR/Linalg.h"
1011

1112
#define DEBUG_TYPE "iree-codegen-common-tile-and-fuse-utils"
@@ -46,36 +47,6 @@ void fuseProducersOfSlices(RewriterBase &rewriter,
4647
}
4748
}
4849

49-
bool warForConsumerFusionSSAViolation(
50-
Operation *rootOp,
51-
const llvm::SmallDenseSet<Operation *> &tiledAndFusedOps) {
52-
auto linalgRootOp = dyn_cast<linalg::LinalgOp>(rootOp);
53-
if (!linalgRootOp) {
54-
return false;
55-
}
56-
SmallVector<utils::IteratorType> iteratorTypes =
57-
linalgRootOp.getIteratorTypesArray();
58-
for (AffineMap map :
59-
llvm::map_range(linalgRootOp.getIndexingMaps(), [](Attribute attr) {
60-
return cast<AffineMapAttr>(attr).getValue();
61-
})) {
62-
if (!compressUnusedDims(map).isIdentity()) {
63-
return false;
64-
}
65-
}
66-
67-
for (OpOperand &use : linalgRootOp->getUses()) {
68-
auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
69-
if (!linalgUser) {
70-
return false;
71-
}
72-
if (!linalgUser.getMatchingIndexingMap(&use).isIdentity()) {
73-
return false;
74-
}
75-
}
76-
return true;
77-
}
78-
7950
void collectTiledAndFusedOps(Operation *rootOp,
8051
llvm::SmallDenseSet<Operation *> &result) {
8152
SmallVector<Operation *> worklist;
@@ -108,56 +79,71 @@ void collectTiledAndFusedOps(Operation *rootOp,
10879
FailureOr<std::queue<Operation *>>
10980
fuseConsumersIntoForall(RewriterBase &rewriter, Operation *tiledOp,
11081
MutableArrayRef<LoopLikeOpInterface> loops,
111-
bool useWARForConsumerFusionSSAViolation) {
112-
auto addCandidateSlices =
113-
[](Operation *fusedOp,
114-
std::queue<tensor::ParallelInsertSliceOp> &candidates) {
115-
for (auto *userOp : fusedOp->getResults().getUsers()) {
116-
if (auto sliceOp =
117-
llvm::dyn_cast<tensor::ParallelInsertSliceOp>(userOp)) {
118-
candidates.push(sliceOp);
119-
}
120-
}
121-
};
122-
82+
std::function<bool(Operation *)> filterFn) {
12383
// Collect the candidate slices which can be potential consumers that can be
12484
// fused.
125-
std::queue<tensor::ParallelInsertSliceOp> candidates;
126-
addCandidateSlices(tiledOp, candidates);
85+
std::queue<SmallVector<Operation *>> candidates;
86+
DenseSet<tensor::ParallelInsertSliceOp> allCandidates;
87+
auto addCandidateSlices = [&candidates, &allCandidates,
88+
&filterFn](Operation *fusedOp) {
89+
for (auto *userOp : fusedOp->getResults().getUsers()) {
90+
auto sliceOp = dyn_cast<tensor::ParallelInsertSliceOp>(userOp);
91+
if (!sliceOp || allCandidates.contains(sliceOp)) {
92+
continue;
93+
}
94+
95+
auto currLoop =
96+
cast<scf::ForallOp>(sliceOp->getParentOp()->getParentOp());
97+
OpResult loopResult = currLoop.getTiedOpResult(
98+
currLoop.getTiedOpOperand(cast<BlockArgument>(sliceOp.getDest())));
99+
auto users = llvm::to_vector(
100+
llvm::make_filter_range(loopResult.getUsers(), filterFn));
101+
if (users.empty()) {
102+
continue;
103+
}
104+
mlir::computeTopologicalSorting(users);
105+
106+
Operation *fusableUser = users.front();
107+
// Check all operands from the `scf.forall`
108+
SmallVector<OpResult> loopResults;
109+
for (OpOperand &opOperand : fusableUser->getOpOperands()) {
110+
if (opOperand.get().getDefiningOp() == currLoop.getOperation()) {
111+
loopResults.push_back(cast<OpResult>(opOperand.get()));
112+
}
113+
}
114+
115+
SmallVector<Operation *> fusedSlices;
116+
for (auto result : loopResults) {
117+
BlockArgument tiedBlockArg =
118+
currLoop.getTiedBlockArgument(currLoop.getTiedOpOperand(result));
119+
SmallVector<tensor::ParallelInsertSliceOp> slices = llvm::map_to_vector(
120+
currLoop.getCombiningOps(tiedBlockArg), [](Operation *op) {
121+
return cast<tensor::ParallelInsertSliceOp>(op);
122+
});
123+
llvm::append_range(fusedSlices, slices);
124+
allCandidates.insert_range(slices);
125+
}
126+
if (!fusedSlices.empty()) {
127+
candidates.emplace(std::move(fusedSlices));
128+
}
129+
}
130+
};
131+
132+
addCandidateSlices(tiledOp);
127133

128134
std::queue<Operation *> newFusionOpportunities;
129135
while (!candidates.empty()) {
130136

131137
// Traverse the slices in BFS fashion.
132-
tensor::ParallelInsertSliceOp candidateSliceOp = candidates.front();
138+
SmallVector<Operation *> candidateSlices = candidates.front();
133139
candidates.pop();
134140

135141
FailureOr<scf::SCFFuseConsumerOfSliceResult> fusedResult =
136-
mlir::scf::tileAndFuseConsumerOfSlices(
137-
rewriter, candidateSliceOp.getOperation(), loops);
142+
mlir::scf::tileAndFuseConsumerOfSlices(rewriter, candidateSlices,
143+
loops);
138144
if (failed(fusedResult)) {
139-
LLVM_DEBUG(llvm::dbgs() << "failed to fuse consumer of slice: "
140-
<< candidateSliceOp << "\n");
141-
continue;
142-
}
143-
144-
// Implement the WAR for consumer fusion SSA violation (as described below
145-
// in the comments for `warForConsumerFusionSSAViolation`)
146-
if (useWARForConsumerFusionSSAViolation) {
147-
for (auto [tiledOpResult, loopResult] :
148-
llvm::zip(tiledOp->getResults(), loops.back()->getResults())) {
149-
for (OpOperand &use : loopResult.getUses()) {
150-
Operation *user = use.getOwner();
151-
if (user->getParentOp() != loops.back()) {
152-
continue;
153-
}
154-
auto slice = dyn_cast<tensor::ExtractSliceOp>(user);
155-
if (!slice) {
156-
return failure();
157-
}
158-
rewriter.replaceAllOpUsesWith(slice, tiledOpResult);
159-
}
160-
}
145+
return candidateSlices.front()->emitOpError(
146+
"failed to fuse consumer of slice");
161147
}
162148

163149
// Replace the original consumer operation with the tiled implementation.
@@ -168,8 +154,7 @@ fuseConsumersIntoForall(RewriterBase &rewriter, Operation *tiledOp,
168154
// values produced by operations that implement the `TilingInterface`.
169155
// Add these operations to the worklist.
170156
addCandidateSlices(
171-
fusedResult->tiledAndFusedConsumerOperands.front()->getOwner(),
172-
candidates);
157+
fusedResult->tiledAndFusedConsumerOperands.front()->getOwner());
173158

174159
// Add the list of new producer fusion opportunities.
175160
for (auto tiledOp : fusedResult.value().tiledOps) {

compiler/src/iree/compiler/Codegen/Common/TileAndFuseUtils.h

Lines changed: 6 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -24,66 +24,6 @@ void fuseProducersOfSlices(RewriterBase &rewriter,
2424
scf::SCFTileAndFuseOptions &options,
2525
MutableArrayRef<LoopLikeOpInterface> loops);
2626

27-
/// Consider the following case
28-
///
29-
/// ```mlir
30-
/// %0:2 = linalg.generic {
31-
/// indexing_maps = [....,
32-
/// affine_map<(d0, d1, d2) -> (d0, d1),
33-
/// affine_map<(d0, d1, d2) -> (d0, d1)>]}
34-
/// %1 = linalg.generic ins(%0#0, %0#1) {
35-
/// indexing_maps = [affine_map<(d0, d1) -> (d0, d1),
36-
/// affine_map<(d0, d1) -> (d0, d1)]}
37-
/// ```
38-
///
39-
/// After tiling the first op we get
40-
///
41-
/// ```
42-
/// %0:2 = scf.forall ... {
43-
/// %1:2 = linalg.generic {
44-
/// indexing_maps = [....,
45-
/// affine_map<(d0, d1, d2) -> (d0, d1),
46-
/// affine_map<(d0, d1, d2) -> (d0, d1)>]}
47-
/// }
48-
/// }
49-
/// %2 = linalg.generic ins(%0#0, %0#1) {
50-
/// indexing_maps = [affine_map<(d0, d1) -> (d0, d1),
51-
/// affine_map<(d0, d1) -> (d0, d1)]}
52-
/// ```
53-
///
54-
/// Due to a quirk of the fusion of consumers, fusing this consumer into the
55-
/// loop results in
56-
///
57-
/// ```
58-
/// %0:2 = scf.forall ... {
59-
/// %1:2 = linalg.generic {
60-
/// indexing_maps = [....,
61-
/// affine_map<(d0, d1, d2) -> (d0, d1),
62-
/// affine_map<(d0, d1, d2) -> (d0, d1)>]}
63-
/// %2 = tensor.extract_slice %0#1 [...]
64-
/// %3 = linalg.generic ins(%1#0, %2) {
65-
/// indexing_maps = [affine_map<(d0, d1) -> (d0, d1),
66-
/// affine_map<(d0, d1) -> (d0, d1)]}
67-
/// }
68-
/// }
69-
/// ```
70-
///
71-
/// This is an SSA violation because of `%0#1` being used in the loop. This
72-
/// needs to be fixed upstream, but for cases where
73-
/// 1. The root operation produces results using an identity indexing map (when
74-
/// ignoring the iteration space dimensions corresponding to the reduction
75-
/// loops)
76-
/// 2. For all consumers of the results of the root operation, access the data
77-
/// using identity indexing map then for each consumer fusion step it is valid
78-
/// to replace all uses of slices of the outer loop that occur within the loop
79-
/// with the correponding tiled result value.
80-
/// This is a workaround till upstream transformation can fix this issue. The
81-
/// following method is testing if such a case exists to implement the
82-
/// work-around.
83-
bool warForConsumerFusionSSAViolation(
84-
Operation *rootOp,
85-
const llvm::SmallDenseSet<Operation *> &tiledAndFusedOps);
86-
8727
/// Starting from `op` walk all operands backwards to find all
8828
/// potentially fusible operations, i.e. operations that implement
8929
/// the `TilingInterface`.
@@ -94,10 +34,12 @@ void collectTiledAndFusedOps(Operation *rootOp,
9434
// Returns a list of new `tensor.extract_slice` ops with new fusion
9535
// opportunities, as well as the new surrounding `scf.forall` (because consumer
9636
// fusion replaces the loop).
97-
FailureOr<std::queue<Operation *>>
98-
fuseConsumersIntoForall(RewriterBase &rewriter, Operation *tiledOp,
99-
MutableArrayRef<LoopLikeOpInterface> loops,
100-
bool useWARForConsumerFusionSSAViolation);
37+
FailureOr<std::queue<Operation *>> fuseConsumersIntoForall(
38+
RewriterBase &rewriter, Operation *tiledOp,
39+
MutableArrayRef<LoopLikeOpInterface> loops,
40+
std::function<bool(Operation *)> filterFn = [](Operation *) {
41+
return true;
42+
});
10143

10244
} // namespace mlir::iree_compiler
10345

compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -331,8 +331,6 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {
331331
mlir::DominanceInfo dominanceInfo(tilableOp);
332332
llvm::SmallDenseSet<Operation *> tiledAndFusedOps;
333333
collectTiledAndFusedOps(tilableOp, tiledAndFusedOps);
334-
bool useWARForConsumerFusionSSAViolation =
335-
warForConsumerFusionSSAViolation(tilableOp, tiledAndFusedOps);
336334

337335
llvm::DenseSet<Operation *> yieldReplacementsFor;
338336
for (auto op : tiledAndFusedOps) {
@@ -413,13 +411,20 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {
413411
return signalPassFailure();
414412
}
415413
for (auto [origValue, replacement] : tileAndFuseResult->replacements) {
416-
rewriter.replaceAllUsesWith(origValue, replacement);
414+
Value replacementCopy = replacement;
415+
rewriter.replaceUsesWithIf(origValue, replacement, [&](OpOperand &use) {
416+
Operation *user = use.getOwner();
417+
return !isa<tensor::DimOp>(user) &&
418+
dominanceInfo.dominates(replacementCopy, user);
419+
});
417420
}
418421
std::swap(tileAndFuseResult->loops, tilingLoops);
419422
Operation *rootTiledOp = tileAndFuseResult->tiledAndFusedOps.front();
420423
FailureOr<std::queue<Operation *>> newFusionOpportunities =
421424
fuseConsumersIntoForall(rewriter, rootTiledOp, tilingLoops,
422-
useWARForConsumerFusionSSAViolation);
425+
[&tiledAndFusedOps](Operation *op) {
426+
return tiledAndFusedOps.contains(op);
427+
});
423428
if (failed(newFusionOpportunities)) {
424429
rootTiledOp->emitOpError("failed to fuse consumers");
425430
return signalPassFailure();

compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_workgroups_using_forall.mlir

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -826,7 +826,7 @@ func.func @pad_fusion(%0 : tensor<?x?xf32>, %1 : tensor<?x?xf32>, %2 : tensor<?x
826826

827827
// -----
828828

829-
// Test 1 of 2 that are testing a work-around for SSA violation issue with consumer fusion upstream.
829+
// Test 1 of 2 that are testing fusion while considering multiple slices.
830830

831831
func.func @horizontal_fusion_consumer_fusion1(%arg0 : tensor<2x4096x640xf16>,
832832
%arg1 : tensor<10x64x640xf16>, %arg2 : tensor<10x64x640xf16>, %arg3 : tensor<10x64x640xf16>)
@@ -893,7 +893,7 @@ func.func @horizontal_fusion_consumer_fusion1(%arg0 : tensor<2x4096x640xf16>,
893893

894894
// -----
895895

896-
// Test 2 of 2 that are testing a work-around for SSA violation issue with consumer fusion upstream.
896+
// Test 2 of 2 that are testing fusion while considering multiple slices.
897897

898898
func.func @horizontal_fusion_consumer_fusion2(%arg0 : tensor<2x4096x640xi8>,
899899
%arg1 : tensor<2x640x640xi8>, %arg2 : tensor<2x640x640xi8>) -> tensor<2x4096x640xf16> {
@@ -989,3 +989,87 @@ func.func @only_producer_fusion_multiple_result(%arg0: tensor<77x4096xf16>, %arg
989989
// CHECK: linalg.generic
990990
// CHECK: linalg.generic
991991
// CHECK: return %[[RESULT]]#1, %[[RESULT]]#0
992+
993+
// -----
994+
995+
func.func @multi_slice_fusion_broadcast(%arg0: index, %arg1: tensor<3x?x32xi64>,
996+
%arg2: tensor<256x32xf32>, %arg3: tensor<32xf32>)
997+
-> (tensor<3x?x32x32xf32>, tensor<3x?x32x32xf32>) {
998+
%c32 = arith.constant 32 : index
999+
%c2_i64 = arith.constant 2 : i64
1000+
%cst = arith.constant 0.000000e+00 : f32
1001+
%cst_0 = arith.constant 3.200000e+01 : f32
1002+
%cst_1 = arith.constant 9.000000e+00 : f32
1003+
%0 = arith.divsi %arg0, %c32 : index
1004+
%1 = affine.apply affine_map<()[s0] -> (s0 floordiv 32)>()[%arg0]
1005+
%2 = tensor.empty(%1) : tensor<3x?x32x32xf32>
1006+
%3 = linalg.generic {
1007+
indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>,
1008+
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
1009+
iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
1010+
ins(%arg1 : tensor<3x?x32xi64>) outs(%2 : tensor<3x?x32x32xf32>) {
1011+
^bb0(%in: i64, %out: f32):
1012+
%8 = arith.index_cast %in : i64 to index
1013+
%9 = linalg.index 3 : index
1014+
%extracted = tensor.extract %arg2[%8, %9] : tensor<256x32xf32>
1015+
linalg.yield %extracted : f32
1016+
} -> tensor<3x?x32x32xf32>
1017+
%4 = tensor.empty(%0) : tensor<3x?x32xf32>
1018+
%5 = linalg.fill ins(%cst : f32)outs(%4 : tensor<3x?x32xf32>) -> tensor<3x?x32xf32>
1019+
%6 = linalg.generic {
1020+
indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
1021+
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>],
1022+
iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
1023+
ins(%3 : tensor<3x?x32x32xf32>) outs(%5 : tensor<3x?x32xf32>)
1024+
attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 4], thread = [1, 1, 1, 0], workgroup = [1, 1, 64, 0]}>} {
1025+
^bb0(%in: f32, %out: f32):
1026+
%8 = math.fpowi %in, %c2_i64 : f32, i64
1027+
%9 = arith.addf %8, %out : f32
1028+
linalg.yield %9 : f32
1029+
} -> tensor<3x?x32xf32>
1030+
%7 = linalg.generic {
1031+
indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3)>,
1032+
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
1033+
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>,
1034+
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
1035+
iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
1036+
ins(%arg3, %3, %6 : tensor<32xf32>, tensor<3x?x32x32xf32>, tensor<3x?x32xf32>)
1037+
outs(%2 : tensor<3x?x32x32xf32>) {
1038+
^bb0(%in: f32, %in_2: f32, %in_3: f32, %out: f32):
1039+
%8 = arith.divf %in_3, %cst_0 : f32
1040+
%9 = arith.addf %8, %cst_1 : f32
1041+
%10 = math.rsqrt %9 : f32
1042+
%11 = arith.mulf %in_2, %10 : f32
1043+
%12 = arith.mulf %in, %11 : f32
1044+
linalg.yield %12 : f32
1045+
} -> tensor<3x?x32x32xf32>
1046+
return %3, %7 : tensor<3x?x32x32xf32>, tensor<3x?x32x32xf32>
1047+
}
1048+
// CHECK-LABEL: func @multi_slice_fusion_broadcast
1049+
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<3x?x32xi64>
1050+
// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: tensor<32xf32>
1051+
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
1052+
// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
1053+
// CHECK: %[[EMPTY:.+]] = tensor.empty
1054+
// CHECK: %[[RESULT:.+]]:2 = scf.forall (%[[IV0:[a-zA-Z0-9]+]], %[[IV1:[a-zA-Z0-9]+]])
1055+
// CHECK-SAME: shared_outs(%[[INIT0:[a-zA-Z0-9]+]] = %[[EMPTY]], %[[INIT1:[a-zA-Z0-9]+]] = %[[EMPTY]])
1056+
// CHECK-DAG: %[[INIT0_SLICE:.+]] = tensor.extract_slice %[[INIT0]][%[[IV0]], %[[IV1]], 0, 0] [1, 1, 32, 32]
1057+
// CHECK-DAG: %[[ARG1_SLICE:.+]] = tensor.extract_slice %[[ARG1]][%[[IV0]], %[[IV1]], 0] [1, 1, 32]
1058+
// CHECK: %[[GENERIC0:.+]] = linalg.generic
1059+
// CHECK-SAME: ins(%[[ARG1_SLICE]] :
1060+
// CHECK-SAME: outs(%[[INIT0_SLICE]] :
1061+
// CHECK: %[[CAST0:.+]] = tensor.cast %[[GENERIC0]]
1062+
// CHECK: %[[EMPTYTILE:.+]] = tensor.empty() : tensor<1x1x32xf32>
1063+
// CHECK: %[[FILL:.+]] = linalg.fill
1064+
// CHECK-SAME: outs(%[[EMPTYTILE]] :
1065+
// CHECK: %[[GENERIC1:.+]] = linalg.generic
1066+
// CHECK-SAME: ins(%[[GENERIC0]] :
1067+
// CHECK-SAME: outs(%[[FILL]] :
1068+
// CHECK: %[[INIT1_SLICE:.+]] = tensor.extract_slice %[[INIT1]][%[[IV0]], %[[IV1]], 0, 0] [1, 1, 32, 32] [1, 1, 1, 1]
1069+
// CHECK: %[[GENERIC2:.+]] = linalg.generic
1070+
// CHECK-SAME: ins(%[[ARG3]], %[[GENERIC0]], %[[GENERIC1]] :
1071+
// CHECK-SAME: outs(%[[INIT1_SLICE]] :
1072+
// CHECK: %[[CAST1:.+]] = tensor.cast %[[GENERIC2]]
1073+
// CHECK-DAG: tensor.parallel_insert_slice %[[CAST0]] into %[[INIT0]][%[[IV0]], %[[IV1]], %[[C0]], 0] [1, 1, %[[C32]], 32]
1074+
// CHECK-DAG: tensor.parallel_insert_slice %[[CAST1]] into %[[INIT1]][%[[IV0]], %[[IV1]], %[[C0]], 0] [1, 1, %[[C32]], 32]
1075+
// CHECK: return %[[RESULT]]#0, %[[RESULT]]#1

0 commit comments

Comments
 (0)