Skip to content

Commit c4577b1

Browse files
committed
[mlir][bufferization]-Add lit tests for unhandled cases in EmptyTensorElimination
In many cases EmptyTensorElimination cannot transform or eliminate the empty tensor which is being inserted into the `SubsetInsertionOpInterface`. There are two major reasons for that: 1- Failing when trying to find a legal/suitable insertion point for the `subsetExtract` which is about to replace the empty tensor. However, we may try to handle this issue by moving the values which are responsible for building the `subsetExtract` near the empty tensor (which is about to be eliminated), thus increasing the probability of finding a legal insertion point. 2- The EmptyTensorElimination transform replaces all of the tensor.empty's uses at once in one application, rather than replacing only the specific use which was visited in the use-def chain (when traversing from the tensor.insert_slice). This scenario of replacing all the uses of the tensor.empty may lead to additional read effects after bufferization of the specific subset extract/subview, which should not be the case. Both cases may result in many copies in the subsequent bufferization which cannot be canonicalized away. The first case can be noticed when having a `tensor.empty` followed by a `SubsetInsertionOpInterface` (or, in simple words, a `tensor.insert_slice`), which has been lowered from `tensor/tosa.concat`. The second case can be noticed when having a `tensor.empty` with many uses, leading to applying the transformation only once, since all of the uses have been replaced at once. This MR only adds the lit tests for the cases shown above (NFC), to emphasize how the transform works; the coming MRs will upload slight changes to handle these cases.
1 parent 323bedd commit c4577b1

File tree

1 file changed

+98
-0
lines changed

1 file changed

+98
-0
lines changed

mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,3 +365,101 @@ func.func @multiple_materialize_in_destination_buffer(%m: memref<5xf32>, %f: f32
365365
bufferization.materialize_in_destination %selected in restrict writable %m : (tensor<5xf32>, memref<5xf32>) -> ()
366366
return
367367
}
368+
369+
// -----
370+
371+
// `EmptyTensorElimination` fails to find a valid insertion
372+
// point for the new injected `SubsetExtraction`.
// NOTE(review): the destination `tensor.empty` (%cancatenated_empty) is
// defined only *after* the two `linalg.fill` producers, so the replacing
// `tensor.extract_slice` would have no insertion point that dominates the
// fills — presumably why no empty tensor is eliminated here; confirm
// against the transform's insertion-point search.
373+
// All three tensor.empty ops bufferize to allocations, and a copy remains.
// CHECK-LABEL: func.func @fail_to_eliminate_any_empty_tensors
374+
func.func @fail_to_eliminate_any_empty_tensors() -> tensor<5x6x128xf32> {
375+
%cst_1 = arith.constant 1.0 : f32
376+
%cst_2 = arith.constant 2.0 : f32
377+
// CHECK: memref.alloc
378+
// CHECK: memref.alloc
379+
// CHECK: memref.alloc
380+
%empty_1 = tensor.empty() : tensor<5x6x64xf32>
381+
%res_1 = linalg.fill ins(%cst_1 : f32) outs(%empty_1 : tensor<5x6x64xf32>) -> tensor<5x6x64xf32>
382+
%empty_2 = tensor.empty() : tensor<5x6x64xf32>
383+
%res_2 = linalg.fill ins(%cst_2 : f32) outs(%empty_2 : tensor<5x6x64xf32>) -> tensor<5x6x64xf32>
384+
// Destination of both insert_slices below; note it is created after the
// fills that produce the inserted values ("cancatenated" is a typo for
// "concatenated", kept since it is only a local SSA name).
%cancatenated_empty = tensor.empty() : tensor<5x6x128xf32>
385+
// CHECK: memref.copy
386+
%inserted_slice_1 = tensor.insert_slice %res_1 into %cancatenated_empty[0, 0, 0][5, 6, 64][1, 1, 1]
387+
: tensor<5x6x64xf32> into tensor<5x6x128xf32>
388+
%inserted_slice_2 = tensor.insert_slice %res_2 into %inserted_slice_1[0, 0, 64][5, 6, 64][1, 1, 1]
389+
: tensor<5x6x64xf32> into tensor<5x6x128xf32>
390+
return %inserted_slice_2 : tensor<5x6x128xf32>
391+
}
392+
393+
// -----
394+
395+
// Same IR as the test above, except the destination `tensor.empty` is
// created *before* the fills. With the destination available early, one of
// the empty tensors can be eliminated: only two allocations remain (vs.
// three above), though one copy still survives.
// CHECK-LABEL: func.func @succeed_to_eliminate_one_empty_tensor
396+
func.func @succeed_to_eliminate_one_empty_tensor() -> tensor<5x6x128xf32> {
397+
%cst_1 = arith.constant 1.0 : f32
398+
%cst_2 = arith.constant 2.0 : f32
399+
// CHECK: memref.alloc
400+
// CHECK: memref.alloc
401+
// Destination created first, so a `tensor.extract_slice` replacing an empty
// tensor has a dominating insertion point.
%cancatenated_empty = tensor.empty() : tensor<5x6x128xf32>
402+
%empty_1 = tensor.empty() : tensor<5x6x64xf32>
403+
%res_1 = linalg.fill ins(%cst_1 : f32) outs(%empty_1 : tensor<5x6x64xf32>) -> tensor<5x6x64xf32>
404+
%empty_2 = tensor.empty() : tensor<5x6x64xf32>
405+
%res_2 = linalg.fill ins(%cst_2 : f32) outs(%empty_2 : tensor<5x6x64xf32>) -> tensor<5x6x64xf32>
406+
// CHECK: memref.copy
407+
%inserted_slice_1 = tensor.insert_slice %res_1 into %cancatenated_empty[0, 0, 0][5, 6, 64][1, 1, 1]
408+
: tensor<5x6x64xf32> into tensor<5x6x128xf32>
409+
%inserted_slice_2 = tensor.insert_slice %res_2 into %inserted_slice_1[0, 0, 64][5, 6, 64][1, 1, 1]
410+
: tensor<5x6x64xf32> into tensor<5x6x128xf32>
411+
return %inserted_slice_2 : tensor<5x6x128xf32>
412+
}
413+
414+
// -----
415+
416+
// `EmptyTensorElimination` replaces all of the uses of the tensor.empty
417+
// with the new injected `SubsetExtraction`, without considering which
418+
// specific use was tracked through the use-def chain, sometimes creating
419+
// non-existent bufferization conflicts.
420+
// NOTE(review): "mutli" in the function name is a typo for "multi"; it is
// kept because the CHECK/CHECK-ELIM labels below reference it verbatim.
421+
// CHECK-ELIM-LABEL: func.func @mutli_use_of_the_same_tensor_empty
422+
// CHECK-LABEL: func.func @mutli_use_of_the_same_tensor_empty
423+
func.func @mutli_use_of_the_same_tensor_empty() -> tensor<5x6x128xf32> {
424+
%cst_1 = arith.constant 1.0 : f32
425+
%cst_2 = arith.constant 2.0 : f32
426+
%cancatenated_empty = tensor.empty() : tensor<5x6x128xf32>
427+
// Single empty tensor used as the destination of BOTH fills below; after
// elimination both fills write into the same extracted slice (VAL_3).
%empty_1 = tensor.empty() : tensor<5x6x64xf32>
428+
// CHECK-ELIM: %[[VAL_3:.*]] = tensor.extract_slice
429+
// CHECK-ELIM: linalg.fill ins(%[[VAL_0:.*]] : f32) outs(%[[VAL_3]]
430+
// CHECK-ELIM: linalg.fill ins(%[[VAL_1:.*]] : f32) outs(%[[VAL_3]]
431+
%res_1 = linalg.fill ins(%cst_1 : f32) outs(%empty_1 : tensor<5x6x64xf32>) -> tensor<5x6x64xf32>
432+
%res_2 = linalg.fill ins(%cst_2 : f32) outs(%empty_1 : tensor<5x6x64xf32>) -> tensor<5x6x64xf32>
433+
// CHECK: memref.copy
434+
%inserted_slice_1 = tensor.insert_slice %res_1 into %cancatenated_empty[0, 0, 0][5, 6, 64][1, 1, 1]
435+
: tensor<5x6x64xf32> into tensor<5x6x128xf32>
436+
// CHECK: memref.copy
437+
%inserted_slice_2 = tensor.insert_slice %res_2 into %inserted_slice_1[0, 0, 64][5, 6, 64][1, 1, 1]
438+
: tensor<5x6x64xf32> into tensor<5x6x128xf32>
439+
return %inserted_slice_2 : tensor<5x6x128xf32>
440+
}
441+
442+
// -----
443+
444+
// `%empty_1` has two uses: the destination of the `linalg.fill` (the use
// tracked from the `tensor.insert_slice`) and a *read* operand of the
// `linalg.generic`. Replacing all uses at once makes the injected subset
// extraction appear to be read by the generic op — a read effect that did
// not exist in the original IR — presumably forcing the alloc and copy
// checked below; confirm against the transform's all-uses replacement.
// CHECK-LABEL: func.func @mutli_use_of_the_same_tensor_empty_creates_non_existent_read
445+
func.func @mutli_use_of_the_same_tensor_empty_creates_non_existent_read(%arg1: tensor<5x6x128xf32> , %arg2: tensor<5x6x64xf32>)
446+
-> (tensor<5x6x128xf32>, tensor<5x6x64xf32>) {
447+
%cst_1 = arith.constant 1.0 : f32
448+
%empty_1 = tensor.empty() : tensor<5x6x64xf32>
449+
// CHECK: memref.alloc
450+
%res_1 = linalg.fill ins(%cst_1 : f32) outs(%empty_1 : tensor<5x6x64xf32>) -> tensor<5x6x64xf32>
451+
// Second use of %empty_1: read (ins) operand, unrelated to the
// insert_slice chain below.
%res_2 = linalg.generic{
452+
indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
453+
iterator_types = ["parallel", "parallel", "parallel"]
454+
}
455+
ins(%empty_1 : tensor<5x6x64xf32>)
456+
outs(%arg2 :tensor<5x6x64xf32>) {
457+
^bb0(%in: f32, %out: f32):
458+
%res = arith.addf %in, %in : f32
459+
linalg.yield %res : f32
460+
} -> tensor<5x6x64xf32>
461+
// CHECK: memref.copy
462+
%inserted_slice_1 = tensor.insert_slice %res_1 into %arg1[0, 0, 0][5, 6, 64][1, 1, 1]
463+
: tensor<5x6x64xf32> into tensor<5x6x128xf32>
464+
return %inserted_slice_1, %res_2 : tensor<5x6x128xf32>, tensor<5x6x64xf32>
465+
}

0 commit comments

Comments
 (0)