Skip to content

Commit cc09909

Browse files
committed
[mlir][bufferization]-Replace only one use in TensorEmptyElimination
This MR handles the second case, where we want to replace only the specific use that was visited in the `use-def` chain (when traversing from the tensor.insert_slice's source). Replacing all uses of the tensor.empty may introduce additional read effects after bufferization of the specific subset extract/subview, which should not be the case; replacing only the tracked use thus eliminates potential copies.
1 parent 3054b21 commit cc09909

File tree

5 files changed

+57
-42
lines changed

5 files changed

+57
-42
lines changed

mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,8 @@ class AnalysisState {
459459
/// Starting from `value`, follow the use-def chain in reverse, always
460460
/// selecting the aliasing OpOperands. Find and return Values for which
461461
/// `condition` evaluates to true. OpOperands of such matching Values are not
462-
/// traversed any further.
462+
/// traversed any further. The visited aliasing OpOperands will be preserved
463+
/// through `visitedOpOperands`.
463464
///
464465
/// When reaching the end of a chain, also return the last Value of that
465466
/// chain if `config.alwaysIncludeLeaves` is set.
@@ -484,7 +485,8 @@ class AnalysisState {
484485
/// `config`.
485486
SetVector<Value> findValueInReverseUseDefChain(
486487
Value value, llvm::function_ref<bool(Value)> condition,
487-
TraversalConfig config = TraversalConfig()) const;
488+
TraversalConfig config = TraversalConfig(),
489+
llvm::DenseSet<OpOperand*> *visitedOpOperands = nullptr) const;
488490

489491
/// Find the values that may define the contents of the given value at
490492
/// runtime. A block argument is always a definition. An OpResult is a

mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -483,10 +483,12 @@ bool AnalysisState::isValueRead(Value value) const {
483483
// Starting from `value`, follow the use-def chain in reverse, always selecting
484484
// the aliasing OpOperands. Find and return Values for which `condition`
485485
// evaluates to true. OpOperands of such matching Values are not traversed any
486-
// further.
486+
// further. The visited aliasing OpOperands will be preserved through
487+
// `visitedOpOperands`.
487488
llvm::SetVector<Value> AnalysisState::findValueInReverseUseDefChain(
488489
Value value, llvm::function_ref<bool(Value)> condition,
489-
TraversalConfig config) const {
490+
TraversalConfig config,
491+
llvm::DenseSet<OpOperand*> *visitedOpOperands) const {
490492
llvm::DenseSet<Value> visited;
491493
llvm::SetVector<Value> result, workingSet;
492494
workingSet.insert(value);
@@ -553,6 +555,8 @@ llvm::SetVector<Value> AnalysisState::findValueInReverseUseDefChain(
553555
}
554556

555557
workingSet.insert(a.opOperand->get());
558+
if (visitedOpOperands)
559+
visitedOpOperands->insert(a.opOperand);
556560
}
557561
}
558562

mlir/lib/Dialect/Bufferization/Transforms/EmptyTensorElimination.cpp

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -57,44 +57,40 @@ neededValuesDominateInsertionPoint(const DominanceInfo &domInfo,
5757
return true;
5858
}
5959

60-
/// Return true if the given `insertionPoint` dominates all uses of
61-
/// `emptyTensorOp`.
62-
static bool insertionPointDominatesUses(const DominanceInfo &domInfo,
63-
Operation *insertionPoint,
64-
Operation *emptyTensorOp) {
65-
return llvm::all_of(emptyTensorOp->getUsers(), [&](Operation *user) {
66-
return domInfo.dominates(insertionPoint, user);
67-
});
68-
}
69-
70-
/// Find a valid insertion point for a replacement of `emptyTensorOp`, assuming
60+
/// Find a valid insertion point for a replacement of `useToBeEliminated`, assuming
7161
/// that the replacement may use any value from `neededValues`.
7262
static Operation *
73-
findValidInsertionPoint(Operation *emptyTensorOp,
63+
findValidInsertionPoint(OpOperand *useToBeEliminated,
7464
const SmallVector<Value> &neededValues) {
7565
DominanceInfo domInfo;
7666

67+
Operation * candidateInsertionPoint = useToBeEliminated->getOwner();
68+
assert(isa<OpResult>(useToBeEliminated->get()) && "expected a result value");
69+
// `tensor.empty` and its user are in different blocks.
70+
if (useToBeEliminated->getOwner()->getBlock() != useToBeEliminated->get().getDefiningOp()->getBlock())
71+
candidateInsertionPoint = useToBeEliminated->get().getDefiningOp();
72+
7773
// Trying to move the needed values before the `emptyTensorOp`.
7874
for (Value val : neededValues) {
79-
if (valueDominateInsertionPoint(domInfo, emptyTensorOp, val))
75+
if (valueDominateInsertionPoint(domInfo, candidateInsertionPoint, val))
8076
continue;
8177
Operation *definingOp = val.getDefiningOp();
8278
if (!definingOp)
8379
continue;
8480

8581
bool isItSafeToMoveOp =
8682
llvm::all_of(definingOp->getOperands(), [&](Value operand) {
87-
return valueDominateInsertionPoint(domInfo, emptyTensorOp, operand);
83+
return valueDominateInsertionPoint(domInfo, candidateInsertionPoint, operand);
8884
});
8985

9086
if (isItSafeToMoveOp)
91-
definingOp->moveBefore(emptyTensorOp);
87+
definingOp->moveBefore(candidateInsertionPoint);
9288
}
9389

94-
// Gather all possible insertion points: the location of `emptyTensorOp` and
90+
// Gather all possible insertion points: the location of `candidateInsertionPoint` and
9591
// right after the definition of each value in `neededValues`.
9692
SmallVector<Operation *> insertionPointCandidates;
97-
insertionPointCandidates.push_back(emptyTensorOp);
93+
insertionPointCandidates.push_back(candidateInsertionPoint);
9894
for (Value val : neededValues) {
9995
// Note: The anchor op is using all of `neededValues`, so:
10096
// * in case of a block argument: There must be at least one op in the block
@@ -116,8 +112,8 @@ findValidInsertionPoint(Operation *emptyTensorOp,
116112
if (!neededValuesDominateInsertionPoint(domInfo, insertionPoint,
117113
neededValues))
118114
continue;
119-
// Check if the insertion point is before all uses.
120-
if (!insertionPointDominatesUses(domInfo, insertionPoint, emptyTensorOp))
115+
// Check if the insertion point is before the use to be replaced.
116+
if (!domInfo.dominates(insertionPoint, useToBeEliminated->getOwner()))
121117
continue;
122118
return insertionPoint;
123119
}
@@ -129,8 +125,9 @@ findValidInsertionPoint(Operation *emptyTensorOp,
129125
LogicalResult mlir::bufferization::eliminateEmptyTensors(
130126
RewriterBase &rewriter, Operation *op, OneShotAnalysisState &state) {
131127
OpBuilder::InsertionGuard g(rewriter);
132-
128+
llvm::DenseSet<OpOperand *> visitedOpOperands;
133129
op->walk([&](SubsetInsertionOpInterface op) {
130+
visitedOpOperands.clear();
134131
OpOperand &source = op.getSourceOperand();
135132
// Skip operands that do not bufferize inplace. "tensor.empty" could still
136133
// be replaced, but the transformation may not be beneficial.
@@ -158,15 +155,23 @@ LogicalResult mlir::bufferization::eliminateEmptyTensors(
158155
SetVector<Value> emptyTensors = state.findValueInReverseUseDefChain(
159156
source.get(), /*condition=*/
160157
[&](Value val) { return val.getDefiningOp<tensor::EmptyOp>(); },
161-
config);
158+
config, &visitedOpOperands);
162159

163160
for (Value v : emptyTensors) {
164161
Operation *emptyTensorOp = v.getDefiningOp();
165162

163+
// Find the use to be replaced from the use-def chain.
164+
auto iter = llvm::find_if(visitedOpOperands, [&emptyTensorOp](OpOperand *opOperand){
165+
return llvm::count(emptyTensorOp->getUses(), *opOperand);
166+
});
167+
if (iter == visitedOpOperands.end())
168+
continue;
169+
OpOperand *useToBeReplaced = *iter;
170+
166171
// Find a suitable insertion point. If no suitable insertion point for
167172
// the replacement can be found, skip this replacement.
168173
Operation *insertionPoint =
169-
findValidInsertionPoint(emptyTensorOp, neededValues);
174+
findValidInsertionPoint(useToBeReplaced, neededValues);
170175
if (!insertionPoint)
171176
continue;
172177

@@ -185,8 +190,8 @@ LogicalResult mlir::bufferization::eliminateEmptyTensors(
185190
replacement = rewriter.create<tensor::CastOp>(v.getLoc(), v.getType(),
186191
replacement);
187192
}
188-
// Replace the tensor::EmptyOp.
189-
rewriter.replaceOp(emptyTensorOp, replacement);
193+
// Replace the specific use of the tensor::EmptyOp.
194+
useToBeReplaced->getOwner()->setOperand(useToBeReplaced->getOperandNumber(), replacement);
190195
state.resetCache();
191196
}
192197

mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis-empty-tensor-elimination.mlir

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,8 @@ func.func @buffer_forwarding_no_conflict(%arg0: tensor<?xf32> {bufferization.wri
5252

5353
// CHECK-LABEL: func @buffer_forwarding_conflict_with_different_element_type
5454
func.func @buffer_forwarding_conflict_with_different_element_type(%arg0: tensor<?xf32> {bufferization.writable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
55-
// CHECK: tensor.extract_slice
56-
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]
5755
%cst = arith.constant 0.000000e+00 : f32
56+
// CHECK: bufferization.alloc_tensor(%arg1)
5857
%0 = tensor.empty(%arg1) : tensor<?xf32>
5958

6059
// CHECK: bufferization.alloc_tensor(%arg1)
@@ -64,6 +63,10 @@ func.func @buffer_forwarding_conflict_with_different_element_type(%arg0: tensor<
6463
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
6564
%2 = linalg.copy ins(%0 : tensor<?xf32>) outs(%1 : tensor<?xbf16>) -> tensor<?xbf16>
6665

66+
67+
// CHECK: tensor.extract_slice
68+
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]
69+
6770
// CHECK: linalg.copy
6871
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]
6972
%3 = linalg.copy ins(%2 : tensor<?xbf16>) outs(%0 : tensor<?xf32>) -> tensor<?xf32>

mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ func.func @fail_to_eliminate_any_empty_tensors() -> tensor<5x6x128xf32> {
396396
func.func @succeed_to_eliminate_one_empty_tensor() -> tensor<5x6x128xf32> {
397397
%cst_1 = arith.constant 1.0 : f32
398398
%cst_2 = arith.constant 2.0 : f32
399-
// CHECK: memref.alloc
399+
// CHECK: memref.alloc() {alignment = 64 : i64} : memref<5x6x128xf32>
400400
// CHECK-NOT: memref.alloc
401401
%cancatenated_empty = tensor.empty() : tensor<5x6x128xf32>
402402
%empty_1 = tensor.empty() : tensor<5x6x64xf32>
@@ -413,10 +413,9 @@ func.func @succeed_to_eliminate_one_empty_tensor() -> tensor<5x6x128xf32> {
413413

414414
// -----
415415

416-
// `EmptyTensorElimination` replaces all of the uses of the tensor
417-
// empty with the new injected `SubsetExtraction`, without to consider
418-
// the specific use has been tracked, sometimes creating a non existent
419-
// bufferization conflicts.
416+
// `EmptyTensorElimination` will replace the specific use of the tensor
417+
// empty with the new injected `SubsetExtraction`, i.e. the specific use
418+
// which has been tracked.
420419

421420
// CHECK-ELIM-LABEL: func.func @mutli_use_of_the_same_tensor_empty
422421
// CHECK-LABEL: func.func @mutli_use_of_the_same_tensor_empty
@@ -425,15 +424,16 @@ func.func @mutli_use_of_the_same_tensor_empty() -> tensor<5x6x128xf32> {
425424
%cst_2 = arith.constant 2.0 : f32
426425
%cancatenated_empty = tensor.empty() : tensor<5x6x128xf32>
427426
%empty_1 = tensor.empty() : tensor<5x6x64xf32>
428-
// CHECK-ELIM: %[[VAL_3:.*]] = tensor.extract_slice
429-
// CHECK-ELIM: linalg.fill ins(%[[VAL_0:.*]] : f32) outs(%[[VAL_3]]
430-
// CHECK-ELIM: linalg.fill ins(%[[VAL_1:.*]] : f32) outs(%[[VAL_3]]
427+
// CHECK-ELIM: %[[VAL_4:.*]] = tensor.extract_slice %[[VAL_2:.*]]
428+
// CHECK-ELIM: linalg.fill ins(%[[VAL_0:.*]] : f32) outs(%[[VAL_4]] : tensor<5x6x64xf32>)
429+
// CHECK-ELIM: %[[VAL_6:.*]] = tensor.insert_slice
431430
%res_1 = linalg.fill ins(%cst_1 : f32) outs(%empty_1 : tensor<5x6x64xf32>) -> tensor<5x6x64xf32>
431+
// CHECK-ELIM: %[[VAL_7:.*]] = tensor.extract_slice %[[VAL_6]]
432+
// CHECK-ELIM: %[[VAL_8:.*]] = linalg.fill ins(%[[VAL_1:.*]] : f32) outs(%[[VAL_7]] : tensor<5x6x64xf32>)
432433
%res_2 = linalg.fill ins(%cst_2 : f32) outs(%empty_1 : tensor<5x6x64xf32>) -> tensor<5x6x64xf32>
433-
// CHECK: memref.copy
434+
// CHECK-NOT: memref.copy
434435
%inserted_slice_1 = tensor.insert_slice %res_1 into %cancatenated_empty[0, 0, 0][5, 6, 64][1, 1, 1]
435436
: tensor<5x6x64xf32> into tensor<5x6x128xf32>
436-
// CHECK: memref.copy
437437
%inserted_slice_2 = tensor.insert_slice %res_2 into %inserted_slice_1[0, 0, 64][5, 6, 64][1, 1, 1]
438438
: tensor<5x6x64xf32> into tensor<5x6x128xf32>
439439
return %inserted_slice_2 : tensor<5x6x128xf32>
@@ -446,7 +446,8 @@ func.func @mutli_use_of_the_same_tensor_empty_creates_non_existent_read(%arg1: t
446446
-> (tensor<5x6x128xf32>, tensor<5x6x64xf32>) {
447447
%cst_1 = arith.constant 1.0 : f32
448448
%empty_1 = tensor.empty() : tensor<5x6x64xf32>
449-
// CHECK: memref.alloc
449+
// CHECK: memref.alloc() {alignment = 64 : i64} : memref<5x6x64xf32>
450+
// CHECK-NOT: memref.alloc
450451
%res_1 = linalg.fill ins(%cst_1 : f32) outs(%empty_1 : tensor<5x6x64xf32>) -> tensor<5x6x64xf32>
451452
%res_2 = linalg.generic{
452453
indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
@@ -458,7 +459,7 @@ func.func @mutli_use_of_the_same_tensor_empty_creates_non_existent_read(%arg1: t
458459
%res = arith.addf %in, %in : f32
459460
linalg.yield %res : f32
460461
} -> tensor<5x6x64xf32>
461-
// CHECK: memref.copy
462+
// CHECK-NOT: memref.copy
462463
%inserted_slice_1 = tensor.insert_slice %res_1 into %arg1[0, 0, 0][5, 6, 64][1, 1, 1]
463464
: tensor<5x6x64xf32> into tensor<5x6x128xf32>
464465
return %inserted_slice_1, %res_2 : tensor<5x6x128xf32>, tensor<5x6x64xf32>

0 commit comments

Comments
 (0)