
Commit 23020a8

[mlir] Add optimization to bubbleUpPadSlice pattern for no pad case (#135859)
In cases where there is no padding on a dim, we do not need to compute new offsets, lengths, and padding for it. For example, the new test case added here can be lowered to just

```mlir
%extracted_slice = tensor.extract_slice %arg0[%arg2, 1, 2] [%arg2, 2, 1] [1, 1, 1] : tensor<3x4x5xf32> to tensor<?x2x1xf32>
```

Without this PR, we would instead get unnecessary affine maps:

```mlir
#map = affine_map<()[s0] -> (3, s0)>
#map1 = affine_map<()[s0, s1] -> (-s0 + 3, s1)>
%0 = affine.min #map()[%arg2]
%1 = affine.min #map1()[%0, %arg2]
%extracted_slice = tensor.extract_slice %arg0[%0, 1, 2] [%1, 2, 1] [1, 1, 1] : tensor<3x4x5xf32> to tensor<?x2x1xf32>
```

Signed-off-by: Nirvedh <[email protected]>
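For context, both lowerings quoted above come from the new test case added in this commit: a `tensor.pad` that leaves dim 0 unpadded, followed by a `tensor.extract_slice` with a dynamic offset and size on that dim (the full test appears in the diff below):

```mlir
%0 = tensor.pad %arg0 low[0, 0, 0] high[0, 7, 8] {
^bb0(%arg1: index, %arg2: index, %arg3: index):
  tensor.yield %pad : f32
} : tensor<3x4x5xf32> to tensor<3x11x13xf32>
%1 = tensor.extract_slice %0[%index, 1, 2] [%index, 2, 1] [1, 1, 1] : tensor<3x11x13xf32> to tensor<?x2x1xf32>
```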
1 parent 6182015 · commit 23020a8

3 files changed (+33, -7 lines)

mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp

Lines changed: 13 additions & 4 deletions

```diff
@@ -122,7 +122,7 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
   OpFoldResult zero = b.getIndexAttr(0);
 
   // Compute new offsets, lengths, low padding, high padding.
-  SmallVector<OpFoldResult> newOffsets, newLengths, newStrides;
+  SmallVector<OpFoldResult> newOffsets, newLengths;
   SmallVector<OpFoldResult> newLows, newHighs;
   // Set to true if the original data source is not read at all.
   bool hasZeroLen = false;
@@ -131,13 +131,25 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
   Value dynHasZeroLenCond;
 
   int64_t rank = padOp.getSourceType().getRank();
+  // Only unit stride supported.
+  SmallVector<OpFoldResult> newStrides(rank, b.getIndexAttr(1));
   for (unsigned dim = 0; dim < rank; ++dim) {
     auto low = padOp.getMixedLowPad()[dim];
     bool hasLowPad = !isConstantIntValue(low, 0);
     auto high = padOp.getMixedHighPad()[dim];
     bool hasHighPad = !isConstantIntValue(high, 0);
     auto offset = offsets[dim];
     auto length = sizes[dim];
+    // If the dim has no padding, we don't need to calculate new values for
+    // that dim, as the existing ones are correct even after the pattern.
+    if (!hasLowPad && !hasHighPad) {
+      newOffsets.push_back(offset);
+      newLengths.push_back(length);
+      newLows.push_back(low);
+      newHighs.push_back(high);
+      continue;
+    }
+
     auto srcSize = tensor::getMixedSize(b, loc, padOp.getSource(), dim);
 
     // The new amount of low padding is `low - offset`. Except for the case
@@ -216,9 +228,6 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
     OpFoldResult newHigh =
         hasHighPad ? sub(sub(length, newLength), newLow) : zero;
     newHighs.push_back(newHigh);
-
-    // Only unit stride supported.
-    newStrides.push_back(b.getIndexAttr(1));
   }
 
   // The shape of the result can be obtained from the sizes passed in.
```
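Two details make the early exit sound. First, `newStrides` is now pre-filled with `rank` unit strides before the loop, because the new `continue` would otherwise skip the per-iteration `push_back` and leave the stride list short. Second, padding a dim with `low = high = 0` is the identity along that dim, so the incoming offset and length already index the source correctly. A minimal sketch of a mixed case (hypothetical IR, not from this commit; `%src`, `%cst`, `%off`, and `%len` are assumed to be defined):

```mlir
// Dim 0 is unpadded (low = high = 0); dim 1 is padded high by 3.
%0 = tensor.pad %src low[0, 0] high[0, 3] {
^bb0(%i: index, %j: index):
  tensor.yield %cst : f32
} : tensor<8x8xf32> to tensor<8x11xf32>

// After bubbleUpPadSlice, dim 0 of this slice reuses %off and %len
// unchanged; only dim 1 goes through the new offset/length/padding
// computation.
%1 = tensor.extract_slice %0[%off, 0] [%len, 2] [1, 1]
    : tensor<8x11xf32> to tensor<?x2xf32>
```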

mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir

Lines changed: 3 additions & 3 deletions

```diff
@@ -7,17 +7,17 @@
 // CHECK: scf.for
 // CHECK: memref.alloc() : memref<128x16xf32, 3>
 // CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK: vector.constant_mask [16, 4] : vector<128x4xi1>
 // CHECK: vector.transfer_read
 // CHECK: vector.transfer_write
 // CHECK: memref.alloc() : memref<16x128xf32, 3>
 // CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK: vector.constant_mask [16, 4] : vector<128x4xi1>
 // CHECK: vector.transfer_read
 // CHECK: vector.transfer_write
 // CHECK: memref.alloc() : memref<128x128xf32, 3>
 // CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK-NOT: mask
 // CHECK: vector.transfer_read
 // CHECK: vector.transfer_write
 // CHECK: linalg.matmul
```
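These CHECK updates track a canonicalization effect rather than a behavioral change in the test: once the pad-slice bounds are passed through unchanged on unpadded dims, the mask bounds fold to constants, and `vector.create_mask` with all-constant operands canonicalizes to `vector.constant_mask` (in the last region the mask vanishes entirely, hence the `CHECK-NOT`). A rough sketch of that folding, assuming constant bounds:

```mlir
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
// A mask built from constant bounds...
%m = vector.create_mask %c16, %c4 : vector<128x4xi1>
// ...canonicalizes to the equivalent constant mask:
%m2 = vector.constant_mask [16, 4] : vector<128x4xi1>
```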

mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir

Lines changed: 17 additions & 0 deletions

```diff
@@ -216,3 +216,20 @@ func.func @dynamic_zero_high_padding(%arg0 : tensor<?x?xf32>, %pad : f32,
   %1 = tensor.extract_slice %0[%o1, %o2] [%s1, %s2] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
   return %1 : tensor<?x?xf32>
 }
+
+// -----
+// CHECK-LABEL: @nopaddim_with_dynamic_extract(
+// CHECK-SAME: %[[ARG0:.*]]: tensor<3x4x5xf32>
+// CHECK-SAME: %[[ARG1:.*]]: f32
+// CHECK-SAME: %[[ARG2:.*]]: index
+// CHECK: %[[RESULT:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 1, 2] [%[[ARG2]], 2, 1] [1, 1, 1] : tensor<3x4x5xf32> to tensor<?x2x1xf32>
+// CHECK: return %[[RESULT]]
+func.func @nopaddim_with_dynamic_extract(%arg0 : tensor<3x4x5xf32>, %pad : f32, %index : index)
+    -> tensor<?x2x1xf32> {
+  %0 = tensor.pad %arg0 low[0, 0, 0] high[0, 7, 8] {
+    ^bb0(%arg1: index, %arg2: index, %arg3: index):
+      tensor.yield %pad : f32
+  } : tensor<3x4x5xf32> to tensor<3x11x13xf32>
+  %1 = tensor.extract_slice %0[%index, 1, 2] [%index, 2, 1] [1, 1, 1] : tensor<3x11x13xf32> to tensor<?x2x1xf32>
+  return %1 : tensor<?x2x1xf32>
+}
```
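The new test pins down the optimized form: the slice is rewritten directly onto the source tensor with the original `%index` offset and length, with no `affine.min` bound computations expected for the unpadded dim.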
