iree-org
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/TileAndFuseUtils.cpp‎
Lines changed: 54 additions & 94 deletions b/‎compiler/src/iree/compiler/Codegen/Common/TileAndFuseUtils.cpp‎
Lines changed: 54 additions & 94 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/TileAndFuseUtils.h‎
Lines changed: 12 additions & 72 deletions b/‎compiler/src/iree/compiler/Codegen/Common/TileAndFuseUtils.h‎
Lines changed: 12 additions & 72 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp‎
Lines changed: 10 additions & 5 deletions b/‎compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp‎
Lines changed: 10 additions & 5 deletions
@@ -8,6 +8,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 
@@ -51,36 +52,6 @@ void fuseProducersOfSlices(RewriterBase &rewriter,
   }
 }
 
-bool warForConsumerFusionSSAViolation(
-    Operation *rootOp,
-    const llvm::SmallDenseSet<Operation *> &tiledAndFusedOps) {
-  auto linalgRootOp = dyn_cast<linalg::LinalgOp>(rootOp);
-  if (!linalgRootOp) {
-    return false;
-  }
-  SmallVector<utils::IteratorType> iteratorTypes =
-      linalgRootOp.getIteratorTypesArray();
-  for (AffineMap map :
-       llvm::map_range(linalgRootOp.getIndexingMaps(), [](Attribute attr) {
-         return cast<AffineMapAttr>(attr).getValue();
-       })) {
-    if (!compressUnusedDims(map).isIdentity()) {
-      return false;
-    }
-  }
-
-  for (OpOperand &use : linalgRootOp->getUses()) {
-    auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
-    if (!linalgUser) {
-      return false;
-    }
-    if (!linalgUser.getMatchingIndexingMap(&use).isIdentity()) {
-      return false;
-    }
-  }
-  return true;
-}
-
 void collectTiledAndFusedOps(Operation *rootOp,
                              llvm::SmallDenseSet<Operation *> &result) {
   SmallVector<Operation *> worklist;
@@ -111,82 +82,72 @@ void collectTiledAndFusedOps(Operation *rootOp,
 }
 
 FailureOr<std::queue<Operation *>>
-fuseConsumersIntoLoops(RewriterBase &rewriter, Operation *tiledOp,
-                       MutableArrayRef<LoopLikeOpInterface> loops,
-                       bool useWARForConsumerFusionSSAViolation) {
-  auto addCandidateSlices = [](Operation *fusedOp,
-                               std::queue<Operation *> &candidates) {
+fuseConsumersIntoForall(RewriterBase &rewriter, Operation *tiledOp,
+                        MutableArrayRef<LoopLikeOpInterface> loops,
+                        std::function<bool(Operation *)> filterFn) {
+  // Queue of candidate slice groups: each entry holds the slices whose common
+  // consumer may be fused into the loop.
+  std::queue<SmallVector<Operation *>> candidates;
+  llvm::SmallDenseSet<tensor::ParallelInsertSliceOp> allCandidates;
+  auto addCandidateSlices = [&candidates, &allCandidates,
+                             &filterFn](Operation *fusedOp) {
     for (auto *userOp : fusedOp->getResults().getUsers()) {
-      if (llvm::isa<tensor::InsertSliceOp, tensor::ParallelInsertSliceOp>(
-              userOp)) {
-        // Users of tiledOp should either be all of type `tensor.insert_slice`
-        // or all of`tensor.parallel_insert_slice`.
-        //
-        // Pattern 1 - tileing with scf.for:
-        //   %out = scf.for ... {
-        //     %0 = scf.for ... {
-        //       %t0 = op
-        //       %t1 = op  %t0                 // <- `tiledOp`
-        //       %1 = tensor.insert_slice %t1
-        //       yield %1
-        //     }
-        //     yield %0
-        //   }
-        //
-        // Pattern 2 - tiling with scf.forall:
-        //   % out = scf.forall ... {
-        //       %t0 = op
-        //       %t1 = op  %t0                 // <- `tiledOp`
-        //       scf.forall.in_parallel {
-        //         tensor.parallel_insert_slice %tile
-        //       }
-        //   }
-        assert((candidates.empty() ||
-                candidates.front()->getName() == userOp->getName()) &&
-               "expected all slice users to be of type tensor.insert_slice "
-               "or of tensor.parallel_insert_slice.");
-        candidates.push(userOp);
+      auto sliceOp = dyn_cast<tensor::ParallelInsertSliceOp>(userOp);
+      if (!sliceOp || allCandidates.contains(sliceOp)) {
+        continue;
+      }
+
+      auto currLoop =
+          cast<scf::ForallOp>(sliceOp->getParentOp()->getParentOp());
+      OpResult loopResult = currLoop.getTiedOpResult(
+          currLoop.getTiedOpOperand(cast<BlockArgument>(sliceOp.getDest())));
+      SmallVector<Operation *> users = llvm::to_vector(
+          llvm::make_filter_range(loopResult.getUsers(), filterFn));
+      if (users.empty()) {
+        continue;
+      }
+      mlir::computeTopologicalSorting(users);
+
+      Operation *fusableUser = users.front();
+      // Collect the user's operands that are results of this `scf.forall`.
+      SmallVector<OpResult> loopResults;
+      for (OpOperand &opOperand : fusableUser->getOpOperands()) {
+        if (opOperand.get().getDefiningOp() == currLoop.getOperation()) {
+          loopResults.push_back(cast<OpResult>(opOperand.get()));
+        }
+      }
+
+      SmallVector<Operation *> fusedSlices;
+      for (OpResult result : loopResults) {
+        BlockArgument tiedBlockArg =
+            currLoop.getTiedBlockArgument(currLoop.getTiedOpOperand(result));
+        SmallVector<tensor::ParallelInsertSliceOp> slices = llvm::map_to_vector(
+            currLoop.getCombiningOps(tiedBlockArg), [](Operation *op) {
+              return cast<tensor::ParallelInsertSliceOp>(op);
+            });
+        llvm::append_range(fusedSlices, slices);
+        allCandidates.insert_range(slices);
+      }
+      if (!fusedSlices.empty()) {
+        candidates.emplace(std::move(fusedSlices));
       }
     }
   };
 
-  // Collect the candidate slices which can be potential consumers that can be
-  // fused.
-  std::queue<Operation *> candidates;
-  addCandidateSlices(tiledOp, candidates);
+  addCandidateSlices(tiledOp);
 
   std::queue<Operation *> newFusionOpportunities;
   while (!candidates.empty()) {
     // Traverse the slices in BFS fashion.
-    Operation *candidateSliceOp = candidates.front();
+    SmallVector<Operation *> candidateSlices = candidates.front();
     candidates.pop();
 
     FailureOr<scf::SCFFuseConsumerOfSliceResult> fusedResult =
-        mlir::scf::tileAndFuseConsumerOfSlices(rewriter, candidateSliceOp,
+        mlir::scf::tileAndFuseConsumerOfSlices(rewriter, candidateSlices,
                                                loops);
     if (failed(fusedResult)) {
-      LLVM_DEBUG(llvm::dbgs() << "failed to fuse consumer of slice: "
-                              << candidateSliceOp << "\n");
-      continue;
-    }
-
-    // Implement the WAR for consumer fusion SSA violation (as described in the
-    // comments for `warForConsumerFusionSSAViolation`)
-    if (useWARForConsumerFusionSSAViolation) {
-      for (auto [tiledOpResult, loopResult] :
-           llvm::zip(tiledOp->getResults(), loops.back()->getResults())) {
-        for (OpOperand &use : loopResult.getUses()) {
-          Operation *user = use.getOwner();
-          if (user->getParentOp() != loops.back()) {
-            continue;
-          }
-          auto slice = dyn_cast<tensor::ExtractSliceOp>(user);
-          if (!slice) {
-            return failure();
-          }
-          rewriter.replaceAllOpUsesWith(slice, tiledOpResult);
-        }
-      }
+      return candidateSlices.front()->emitOpError(
+          "failed to fuse consumer of slice");
     }
 
     // Replace the original consumer operation with the tiled implementation.
@@ -197,8 +158,7 @@ fuseConsumersIntoLoops(RewriterBase &rewriter, Operation *tiledOp,
     // values produced by operations that implement the `TilingInterface`.
     // Add these operations to the worklist.
     addCandidateSlices(
-        fusedResult->tiledAndFusedConsumerOperands.front()->getOwner(),
-        candidates);
+        fusedResult->tiledAndFusedConsumerOperands.front()->getOwner());
 
     // Add the list of new producer fusion opportunities.
     for (auto tiledOp : fusedResult.value().tiledOps) {
 
@@ -23,83 +23,23 @@ void fuseProducersOfSlices(RewriterBase &rewriter,
                            scf::SCFTileAndFuseOptions &options,
                            MutableArrayRef<LoopLikeOpInterface> loops);
 
-/// Consider the following case
-///
-/// ```mlir
-/// %0:2 = linalg.generic {
-///     indexing_maps = [....,
-///                      affine_map<(d0, d1, d2) -> (d0, d1),
-///                      affine_map<(d0, d1, d2) -> (d0, d1)>]}
-/// %1 = linalg.generic ins(%0#0, %0#1) {
-///     indexing_maps = [affine_map<(d0, d1) -> (d0, d1),
-///                      affine_map<(d0, d1) -> (d0, d1)]}
-/// ```
-///
-/// After tiling the first op we get
-///
-/// ```
-/// %0:2 = scf.forall ... {
-///   %1:2 = linalg.generic {
-///       indexing_maps = [....,
-///                        affine_map<(d0, d1, d2) -> (d0, d1),
-///                        affine_map<(d0, d1, d2) -> (d0, d1)>]}
-///   }
-/// }
-/// %2 = linalg.generic ins(%0#0, %0#1) {
-///     indexing_maps = [affine_map<(d0, d1) -> (d0, d1),
-///                      affine_map<(d0, d1) -> (d0, d1)]}
-/// ```
-///
-/// Due to a quirk of the fusion of consumers, fusing this consumer into the
-/// loop results in
-///
-/// ```
-/// %0:2 = scf.forall ... {
-///   %1:2 = linalg.generic {
-///       indexing_maps = [....,
-///                        affine_map<(d0, d1, d2) -> (d0, d1),
-///                        affine_map<(d0, d1, d2) -> (d0, d1)>]}
-///   %2 = tensor.extract_slice %0#1 [...]
-///   %3 = linalg.generic ins(%1#0, %2) {
-///       indexing_maps = [affine_map<(d0, d1) -> (d0, d1),
-///                        affine_map<(d0, d1) -> (d0, d1)]}
-///   }
-/// }
-/// ```
-///
-/// This is an SSA violation because of `%0#1` being used in the loop. This
-/// needs to be fixed upstream, but for cases where
-/// 1. The root operation produces results using an identity indexing map (when
-/// ignoring the iteration space dimensions corresponding to the reduction
-/// loops)
-/// 2. For all consumers of the results of the root operation, access the data
-/// using identity indexing map then for each consumer fusion step it is valid
-/// to replace all uses of slices of the outer loop that occur within the loop
-/// with the correponding tiled result value.
-/// This is a workaround till upstream transformation can fix this issue. The
-/// following method is testing if such a case exists to implement the
-/// work-around.
-bool warForConsumerFusionSSAViolation(
-    Operation *rootOp,
-    const llvm::SmallDenseSet<Operation *> &tiledAndFusedOps);
-
 /// Starting from `op` walk all operands backwards to find all
 /// potentially fusible operations, i.e. operations that implement
 /// the `TilingInterface`.
 void collectTiledAndFusedOps(Operation *rootOp,
                              llvm::SmallDenseSet<Operation *> &result);
-/// Fuses consumers of `tiledOp` into the surrounding `loops`.
-///
-/// For any previous producer consumer fusion it's expected that `tiledOp` was
-/// the consumer into which producers were fused, i.e. `loops` shouldn't contain
-/// a consumer of `tiledOp` that isn't an insert_slice op.
-/// `fuseConsumersIntoLoops` will fuse consumers of `tiledOp` into surrounding
-/// `scf.forall` or `scf.for` loops and return a list of slice ops that expose
-/// new fusion opportunities.
-FailureOr<std::queue<Operation *>>
-fuseConsumersIntoLoops(RewriterBase &rewriter, Operation *tiledOp,
-                       MutableArrayRef<LoopLikeOpInterface> loops,
-                       bool useWARForConsumerFusionSSAViolation);
+
+/// Fuses consumers of `tiledOp` accepted by `filterFn` into the surrounding
+/// `scf.forall`. Returns new `tensor.extract_slice` ops that expose further
+/// fusion opportunities. Consumer fusion replaces the loop; presumably `loops`
+/// (not the return value) is updated to the new `scf.forall` -- TODO confirm.
+FailureOr<std::queue<Operation *>> fuseConsumersIntoForall(
+    RewriterBase &rewriter, Operation *tiledOp,
+    MutableArrayRef<LoopLikeOpInterface> loops,
+    std::function<bool(Operation *)> filterFn = [](Operation *) {
+      return true;
+    });
+
 } // namespace mlir::iree_compiler
 
 #endif // IREE_COMPILER_CODEGEN_COMMON_TILEANDFUSEUTILS_H_
@@ -331,8 +331,6 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {
   mlir::DominanceInfo dominanceInfo(tilableOp);
   llvm::SmallDenseSet<Operation *> tiledAndFusedOps;
   collectTiledAndFusedOps(tilableOp, tiledAndFusedOps);
-  bool useWARForConsumerFusionSSAViolation =
-      warForConsumerFusionSSAViolation(tilableOp, tiledAndFusedOps);
 
   llvm::DenseSet<Operation *> yieldReplacementsFor;
   for (auto op : tiledAndFusedOps) {
@@ -413,13 +411,20 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {
       return signalPassFailure();
     }
     for (auto [origValue, replacement] : tileAndFuseResult->replacements) {
-      rewriter.replaceAllUsesWith(origValue, replacement);
+      Value replacementCopy = replacement;
+      rewriter.replaceUsesWithIf(origValue, replacement, [&](OpOperand &use) {
+        Operation *user = use.getOwner();
+        return !isa<tensor::DimOp>(user) &&
+               dominanceInfo.dominates(replacementCopy, user);
+      });
     }
     std::swap(tileAndFuseResult->loops, tilingLoops);
     Operation *rootTiledOp = tileAndFuseResult->tiledAndFusedOps.front();
     FailureOr<std::queue<Operation *>> newFusionOpportunities =
-        fuseConsumersIntoLoops(rewriter, rootTiledOp, tilingLoops,
-                               useWARForConsumerFusionSSAViolation);
+        fuseConsumersIntoForall(rewriter, rootTiledOp, tilingLoops,
+                                [&tiledAndFusedOps](Operation *op) {
+                                  return tiledAndFusedOps.contains(op);
+                                });
     if (failed(newFusionOpportunities)) {
       rootTiledOp->emitOpError("failed to fuse consumers");
       return signalPassFailure();