Skip to content

Commit eb3aed5

Browse files
committed
Extend concat -> slice canonicalization to remove concat inputs if possible
1 parent 358df15 commit eb3aed5

File tree

2 files changed

+126
-22
lines changed

2 files changed

+126
-22
lines changed

mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp

Lines changed: 34 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -653,35 +653,47 @@ struct ConcatSliceOptimization : public OpRewritePattern<tosa::SliceOp> {
653653

654654
llvm::SmallVector<int64_t> sliceStart(sliceOp.getStart());
655655
llvm::ArrayRef<int64_t> sliceSize = sliceOp.getSize();
656-
657-
// Validate slice on the concatenated axis. Slicing along this
658-
// axis should span only one of the inputs to the concatenate
659-
// operation.
660-
std::optional<Value> replaceWithSlice;
656+
llvm::SmallVector<Value> requiredConcatInputs;
657+
int64_t processedOriginalConcatInputSize = 0;
658+
int64_t droppedConcatInputSize = 0;
661659
for (auto input : inputs) {
662-
auto inputType = dyn_cast<RankedTensorType>(input.getType());
660+
const auto inputType = dyn_cast<RankedTensorType>(input.getType());
663661
if (!inputType || !inputType.hasStaticShape())
664662
return rewriter.notifyMatchFailure(
665663
sliceOp, "concat input must be a static ranked tensor");
666-
667-
if (sliceStart[axis] >= 0 &&
668-
(sliceStart[axis] + sliceSize[axis]) <= inputType.getDimSize(axis)) {
669-
replaceWithSlice = rewriter
670-
.create<tosa::SliceOp>(
671-
sliceOp.getLoc(), sliceOp.getType(), input,
672-
rewriter.getDenseI64ArrayAttr(sliceStart),
673-
rewriter.getDenseI64ArrayAttr(sliceSize))
674-
.getResult();
675-
break;
664+
if (processedOriginalConcatInputSize <
665+
(sliceStart[axis] + sliceSize[axis]) &&
666+
(processedOriginalConcatInputSize + inputType.getDimSize(axis)) >
667+
sliceStart[axis]) {
668+
if (requiredConcatInputs.empty()) {
669+
droppedConcatInputSize = processedOriginalConcatInputSize;
670+
}
671+
requiredConcatInputs.push_back(input);
676672
}
677-
sliceStart[axis] -= inputType.getDimSize(axis);
673+
processedOriginalConcatInputSize += inputType.getDimSize(axis);
678674
}
679-
680-
if (!replaceWithSlice)
675+
if (requiredConcatInputs.size() == concatOp->getNumOperands()) {
681676
return rewriter.notifyMatchFailure(
682-
sliceOp, "corresponding concat input not found for slice");
683-
684-
rewriter.replaceOp(sliceOp, replaceWithSlice.value());
677+
sliceOp, "Could not reduce number of inputs to preceding concat");
678+
}
679+
if (requiredConcatInputs.size() != 1 && !concatOp->hasOneUse()) {
680+
return rewriter.notifyMatchFailure(
681+
sliceOp,
682+
"Preceding concat must have a single use"); // Do not introduce new
683+
// concats
684+
}
685+
if (requiredConcatInputs.empty()) {
686+
return rewriter.notifyMatchFailure(
687+
sliceOp, "degenerate slice with zero sized dim in output");
688+
}
689+
sliceStart[axis] -= droppedConcatInputSize;
690+
auto newConcat = rewriter.create<tosa::ConcatOp>(concatOp->getLoc(),
691+
requiredConcatInputs, axis);
692+
auto newSlice = rewriter.create<tosa::SliceOp>(
693+
sliceOp->getLoc(), sliceOp.getType(), newConcat,
694+
rewriter.getDenseI64ArrayAttr(sliceStart),
695+
rewriter.getDenseI64ArrayAttr(sliceSize));
696+
rewriter.replaceOp(sliceOp, newSlice);
685697
return success();
686698
}
687699
};

mlir/test/Dialect/Tosa/canonicalize.mlir

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -829,6 +829,98 @@ func.func @canonicalize_cross_concat_inputs(%arg0 : tensor<1x12x12xf32>, %arg1 :
829829

830830
// -----
831831

832+
// CHECK-LABEL: func.func @canonicalize_concat_slice_partial_concat_start_overlap
833+
// CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<1x12x12x2xf32>, [[PARAM_1_:%.+]]: tensor<1x12x12x2xf32>, [[PARAM_2_:%.+]]: tensor<1x12x12x2xf32>) -> tensor<1x12x12x2xf32> {
834+
// CHECK: [[VAR_0_:%.+]] = tosa.concat [[PARAM_0_]], [[PARAM_1_]] {axis = 3 : i32} : (tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>) -> tensor<1x12x12x4xf32>
835+
// CHECK: [[VAR_1_:%.+]] = tosa.slice [[VAR_0_]] {size = array<i64: 1, 12, 12, 2>, start = array<i64: 0, 0, 0, 1>} : (tensor<1x12x12x4xf32>) -> tensor<1x12x12x2xf32>
836+
// CHECK: return [[VAR_1_]] : tensor<1x12x12x2xf32>
837+
func.func @canonicalize_concat_slice_partial_concat_start_overlap(%arg0 : tensor<1x12x12x2xf32>, %arg1 : tensor<1x12x12x2xf32>, %arg2 : tensor<1x12x12x2xf32>) -> tensor<1x12x12x2xf32> {
838+
%0 = tosa.concat %arg0, %arg1, %arg2 {axis = 3 : i32} : (tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>) -> tensor<1x12x12x6xf32>
839+
%1 = tosa.slice %0 {size = array<i64: 1, 12, 12, 2>, start = array<i64: 0, 0, 0, 1>} : (tensor<1x12x12x6xf32>) -> tensor<1x12x12x2xf32>
840+
return %1 : tensor<1x12x12x2xf32>
841+
}
842+
843+
// -----
844+
845+
// CHECK-LABEL: func.func @canonicalize_concat_slice_partial_concat_end_overlap
846+
// CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<1x12x12x2xf32>, [[PARAM_1_:%.+]]: tensor<1x12x12x2xf32>, [[PARAM_2_:%.+]]: tensor<1x12x12x2xf32>) -> tensor<1x12x12x2xf32> {
847+
// CHECK: [[VAR_0_:%.+]] = tosa.concat [[PARAM_1_]], [[PARAM_2_]] {axis = 3 : i32} : (tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>) -> tensor<1x12x12x4xf32>
848+
// CHECK: [[VAR_1_:%.+]] = tosa.slice [[VAR_0_]] {size = array<i64: 1, 12, 12, 2>, start = array<i64: 0, 0, 0, 1>} : (tensor<1x12x12x4xf32>) -> tensor<1x12x12x2xf32>
849+
// CHECK: return [[VAR_1_]] : tensor<1x12x12x2xf32>
850+
func.func @canonicalize_concat_slice_partial_concat_end_overlap(%arg0 : tensor<1x12x12x2xf32>, %arg1 : tensor<1x12x12x2xf32>, %arg2 : tensor<1x12x12x2xf32>) -> tensor<1x12x12x2xf32> {
851+
%0 = tosa.concat %arg0, %arg1, %arg2 {axis = 3 : i32} : (tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>) -> tensor<1x12x12x6xf32>
852+
%1 = tosa.slice %0 {size = array<i64: 1, 12, 12, 2>, start = array<i64: 0, 0, 0, 3>} : (tensor<1x12x12x6xf32>) -> tensor<1x12x12x2xf32>
853+
return %1 : tensor<1x12x12x2xf32>
854+
}
855+
856+
// -----
857+
858+
// CHECK-LABEL: func.func @canonicalize_concat_slice_partial_concat_all_overlap
859+
// CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<1x12x12x2xf32>, [[PARAM_1_:%.+]]: tensor<1x12x12x2xf32>, [[PARAM_2_:%.+]]: tensor<1x12x12x2xf32>) -> tensor<1x12x12x4xf32> {
860+
// CHECK: [[VAR_0_:%.+]] = tosa.concat [[PARAM_0_]], [[PARAM_1_]], [[PARAM_2_]] {axis = 3 : i32} : (tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>) -> tensor<1x12x12x6xf32>
861+
// CHECK: [[VAR_1_:%.+]] = tosa.slice [[VAR_0_]] {size = array<i64: 1, 12, 12, 4>, start = array<i64: 0, 0, 0, 1>} : (tensor<1x12x12x6xf32>) -> tensor<1x12x12x4xf32>
862+
// CHECK: return [[VAR_1_]] : tensor<1x12x12x4xf32>
863+
func.func @canonicalize_concat_slice_partial_concat_all_overlap(%arg0 : tensor<1x12x12x2xf32>, %arg1 : tensor<1x12x12x2xf32>, %arg2 : tensor<1x12x12x2xf32>) -> tensor<1x12x12x4xf32> {
864+
%0 = tosa.concat %arg0, %arg1, %arg2 {axis = 3 : i32} : (tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>) -> tensor<1x12x12x6xf32>
865+
%1 = tosa.slice %0 {size = array<i64: 1, 12, 12, 4>, start = array<i64: 0, 0, 0, 1>} : (tensor<1x12x12x6xf32>) -> tensor<1x12x12x4xf32>
866+
return %1 : tensor<1x12x12x4xf32>
867+
}
868+
869+
// -----
870+
871+
// CHECK-LABEL: func.func @canonicalize_concat_slice_partial_concat_multi_use
872+
// CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<1x12x12x2xf32>, [[PARAM_1_:%.+]]: tensor<1x12x12x2xf32>, [[PARAM_2_:%.+]]: tensor<1x12x12x2xf32>) -> (tensor<1x12x12x6xf32>, tensor<1x12x12x2xf32>) {
873+
// CHECK: [[VAR_0_:%.+]] = tosa.concat [[PARAM_0_]], [[PARAM_1_]], [[PARAM_2_]] {axis = 3 : i32} : (tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>) -> tensor<1x12x12x6xf32>
874+
// CHECK: [[VAR_1_:%.+]] = tosa.slice [[VAR_0_]] {size = array<i64: 1, 12, 12, 2>, start = array<i64: 0, 0, 0, 1>} : (tensor<1x12x12x6xf32>) -> tensor<1x12x12x2xf32>
875+
// CHECK: return [[VAR_0_]], [[VAR_1_]] : tensor<1x12x12x6xf32>, tensor<1x12x12x2xf32>
876+
func.func @canonicalize_concat_slice_partial_concat_multi_use(%arg0 : tensor<1x12x12x2xf32>, %arg1 : tensor<1x12x12x2xf32>, %arg2 : tensor<1x12x12x2xf32>) -> (tensor<1x12x12x6xf32>, tensor<1x12x12x2xf32>) {
877+
%0 = tosa.concat %arg0, %arg1, %arg2 {axis = 3 : i32} : (tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>) -> tensor<1x12x12x6xf32>
878+
%1 = tosa.slice %0 {size = array<i64: 1, 12, 12, 2>, start = array<i64: 0, 0, 0, 1>} : (tensor<1x12x12x6xf32>) -> tensor<1x12x12x2xf32>
879+
return %0, %1 : tensor<1x12x12x6xf32>, tensor<1x12x12x2xf32>
880+
}
881+
882+
// -----
883+
884+
// CHECK-LABEL: func.func @canonicalize_concat_slice_zero_dim
885+
// CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<1x12x12x2xf32>, [[PARAM_1_:%.+]]: tensor<1x12x12x2xf32>, [[PARAM_2_:%.+]]: tensor<1x12x12x2xf32>) -> tensor<1x12x12x0xf32> {
886+
// CHECK: [[VAR_0_:%.+]] = tosa.concat [[PARAM_0_]], [[PARAM_1_]], [[PARAM_2_]] {axis = 3 : i32} : (tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>) -> tensor<1x12x12x6xf32>
887+
// CHECK: [[VAR_1_:%.+]] = tosa.slice [[VAR_0_]] {size = array<i64: 1, 12, 12, 0>, start = array<i64: 0, 0, 0, 0>} : (tensor<1x12x12x6xf32>) -> tensor<1x12x12x0xf32>
888+
// CHECK: return [[VAR_1_]] : tensor<1x12x12x0xf32>
889+
// CHECK: }
890+
func.func @canonicalize_concat_slice_zero_dim(%arg0 : tensor<1x12x12x2xf32>, %arg1 : tensor<1x12x12x2xf32>, %arg2 : tensor<1x12x12x2xf32>) -> tensor<1x12x12x0xf32> {
891+
%0 = tosa.concat %arg0, %arg1, %arg2 {axis = 3 : i32} : (tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>, tensor<1x12x12x2xf32>) -> tensor<1x12x12x6xf32>
892+
%1 = tosa.slice %0 {size = array<i64: 1, 12, 12, 0>, start = array<i64: 0, 0, 0, 0>} : (tensor<1x12x12x6xf32>) -> tensor<1x12x12x0xf32>
893+
return %1 : tensor<1x12x12x0xf32>
894+
}
895+
896+
// -----
897+
898+
// CHECK-LABEL: func.func @canonicalize_tile_slice
899+
// CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<1x12x12x10x10xf32>) -> tensor<1x120x12x10x16xf32> {
900+
// CHECK: [[VAR_0_:%.+]] = tosa.tile [[PARAM_0_]] {multiples = array<i64: 1, 10, 2, 2, 3>} : (tensor<1x12x12x10x10xf32>) -> tensor<1x120x24x20x30xf32>
901+
// CHECK: [[VAR_1_:%.+]] = tosa.slice [[VAR_0_]] {size = array<i64: 1, 120, 12, 10, 16>, start = array<i64: 0, 0, 1, 1, 8>} : (tensor<1x120x24x20x30xf32>) -> tensor<1x120x12x10x16xf32>
902+
// CHECK: return [[VAR_1_]] : tensor<1x120x12x10x16xf32>
903+
func.func @canonicalize_tile_slice(%arg0 : tensor<1x12x12x10x10xf32>) -> tensor<1x120x12x10x16xf32> {
904+
%0 = tosa.tile %arg0 {multiples = array<i64: 10, 10, 10, 10, 10>} : (tensor<1x12x12x10x10xf32>) -> tensor<10x120x120x100x100xf32>
905+
%1 = tosa.slice %0 {size = array<i64: 1, 120, 12, 10, 16>, start = array<i64: 0, 0, 1, 1, 18>} : (tensor<10x120x120x100x100xf32>) -> tensor<1x120x12x10x16xf32>
906+
return %1 : tensor<1x120x12x10x16xf32>
907+
}
908+
909+
// -----
910+
911+
// CHECK-LABEL: func.func @canonicalize_tile_slice_multi_output
912+
// CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<1x12x12x10x10xf32>) -> (tensor<10x120x120x100x100xf32>, tensor<1x12x12x10x16xf32>) {
913+
// CHECK: [[VAR_0_:%.+]] = tosa.tile [[PARAM_0_]] {multiples = array<i64: 10, 10, 10, 10, 10>} : (tensor<1x12x12x10x10xf32>) -> tensor<10x120x120x100x100xf32>
914+
// CHECK: [[VAR_1_:%.+]] = tosa.slice [[VAR_0_]] {size = array<i64: 1, 12, 12, 10, 16>, start = array<i64: 0, 0, 1, 1, 18>} : (tensor<10x120x120x100x100xf32>) -> tensor<1x12x12x10x16xf32>
915+
// CHECK: return [[VAR_0_]], [[VAR_1_]] : tensor<10x120x120x100x100xf32>, tensor<1x12x12x10x16xf32>
916+
func.func @canonicalize_tile_slice_multi_output(%arg0 : tensor<1x12x12x10x10xf32>) -> (tensor<10x120x120x100x100xf32>, tensor<1x12x12x10x16xf32>) {
917+
%0 = tosa.tile %arg0 {multiples = array<i64: 10, 10, 10, 10, 10>} : (tensor<1x12x12x10x10xf32>) -> tensor<10x120x120x100x100xf32>
918+
%1 = tosa.slice %0 {size = array<i64: 1, 12, 12, 10, 16>, start = array<i64: 0, 0, 1, 1, 18>} : (tensor<10x120x120x100x100xf32>) -> tensor<1x12x12x10x16xf32>
919+
return %0, %1 : tensor<10x120x120x100x100xf32>, tensor<1x12x12x10x16xf32>
920+
}
921+
922+
// -----
923+
832924
// CHECK-LABEL: @canonicalize_optimize_sqrt_reciprocal
833925
func.func @canonicalize_optimize_sqrt_reciprocal(%arg0: tensor<1x5x1x1xf32>) -> tensor<1x5x1x1xf32> {
834926
// CHECK: %[[RSQRT:.*]] = tosa.rsqrt %arg{{.*}} : (tensor<1x5x1x1xf32>) -> tensor<1x5x1x1xf32>

0 commit comments

Comments (0)