Commit 601f21a

address comments
Signed-off-by: hanhanW <[email protected]>
1 parent 2fc0316

File tree: 5 files changed (+82, -31 lines changed)

mlir/include/mlir/Dialect/Linalg/IR/Linalg.h

Lines changed: 13 additions & 5 deletions

@@ -90,11 +90,6 @@ Value createOrFoldDimOp(OpBuilder &b, Location loc, Value val, int64_t dim);
 OpFoldResult createFoldedDimOp(OpBuilder &b, Location loc, Value val,
                                int64_t dim);
 
-/// Returns the outer shape in the packed domain before applying the
-/// transposition.
-template <typename OpTy>
-SmallVector<int64_t> getPackedOuterShapeWithoutTransposition(OpTy packOrUnPack);
-
 } // namespace linalg
 } // namespace mlir
 

@@ -150,4 +145,17 @@ std::pair<int64_t, int64_t> getFmrFromWinogradConv2DFmr(WinogradConv2DFmr fmr);
 #define GET_OP_CLASSES
 #include "mlir/Dialect/Linalg/IR/LinalgRelayoutOps.h.inc"
 
+namespace mlir {
+namespace linalg {
+
+/// Returns the outer shape in the packed domain before applying the
+/// transposition.
+template <typename OpTy,
+          typename = std::enable_if_t<std::is_same_v<OpTy, linalg::PackOp> ||
+                                      std::is_same_v<OpTy, linalg::UnPackOp>>>
+SmallVector<int64_t> getPackedOuterShapeWithoutTransposition(OpTy packOrUnPack);
+
+} // namespace linalg
+} // namespace mlir
+
 #endif // MLIR_DIALECT_LINALG_IR_LINALG_H
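Note on the move: the declaration now sits below the LinalgRelayoutOps.h.inc include, so linalg::PackOp and linalg::UnPackOp are already declared when the std::enable_if_t constraint names them. The sketch below is a minimal, self-contained illustration of the same SFINAE idiom with hypothetical stand-in op types; it is not the MLIR code itself.

#include <cstdint>
#include <type_traits>
#include <vector>

// Stand-ins for illustration only; in MLIR these are the linalg::PackOp and
// linalg::UnPackOp classes generated into LinalgRelayoutOps.h.inc.
struct PackOp {};
struct UnPackOp {};
struct GenericOp {};

// The defaulted second template parameter removes this function from overload
// resolution for any OpTy other than PackOp or UnPackOp, so a misuse fails
// with a clear compile-time error.
template <typename OpTy,
          typename = std::enable_if_t<std::is_same_v<OpTy, PackOp> ||
                                      std::is_same_v<OpTy, UnPackOp>>>
std::vector<int64_t> getPackedOuterShapeWithoutTransposition(OpTy op) {
  return {}; // The real implementation lives in LinalgOps.cpp.
}

int main() {
  auto a = getPackedOuterShapeWithoutTransposition(PackOp{});   // OK
  auto b = getPackedOuterShapeWithoutTransposition(UnPackOp{}); // OK
  // getPackedOuterShapeWithoutTransposition(GenericOp{}); // error: no match
  return static_cast<int>(a.size() + b.size());
}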

mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp

Lines changed: 4 additions & 4 deletions

@@ -4491,7 +4491,7 @@ Speculation::Speculatability ElementwiseOp::getSpeculatability() {
 // PackOp/UnPackOp Common
 //===----------------------------------------------------------------------===//
 
-template <typename OpTy>
+template <typename OpTy, typename>
 SmallVector<int64_t>
 getPackedOuterShapeWithoutTransposition(OpTy packOrUnPack) {
   RankedTensorType packedType = (std::is_same<OpTy, PackOp>::value)

@@ -5520,19 +5520,19 @@ bool UnPackOp::canFoldSliceOp(tensor::ExtractSliceOp sliceOp) {
   if (!areAllConstantIntValue(sliceOp.getMixedOffsets(), 0) ||
       !areAllConstantIntValue(sliceOp.getMixedStrides(), 1))
     return false;
-  RankedTensorType unpackedType = sliceOp.getResultType();
+  RankedTensorType unpackedTypeAfterFold = sliceOp.getResultType();
   SmallVector<int64_t> outerShapeWithoutTranspose =
       getPackedOuterShapeWithoutTransposition(*this);
   for (auto [pos, tileSize] :
        llvm::zip_equal(this->getInnerDimsPos(), this->getStaticInnerTiles())) {
-    if (unpackedType.isDynamicDim(pos))
+    if (unpackedTypeAfterFold.isDynamicDim(pos))
       return false;
     if (ShapedType::isDynamic(outerShapeWithoutTranspose[pos]))
       return false;
     if (ShapedType::isDynamic(tileSize))
       return false;
     int64_t paddingSize = outerShapeWithoutTranspose[pos] * tileSize -
-                          unpackedType.getDimSize(pos);
+                          unpackedTypeAfterFold.getDimSize(pos);
     if (paddingSize >= tileSize)
       return false;
   }
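For intuition, the paddingSize bound can be replayed with the shapes used in the new canonicalize.mlir tests below: dim 1 of tensor<28x2x1x16x16xf32> has outer size 2 and inner tile 16, so any sliced extent in (16, 32] keeps the padding under one tile. A minimal standalone sketch of just that arithmetic (variable names are illustrative, not from the MLIR sources):

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Replays the paddingSize check from UnPackOp::canFoldSliceOp with concrete
// numbers: outer size 2, inner tile 16, so the full data extent is 2 * 16 = 32.
int main() {
  const int64_t outerSize = 2;  // outerShapeWithoutTranspose[pos]
  const int64_t tileSize = 16;  // static inner tile at pos
  for (int64_t slicedExtent : {17, 16}) {
    const int64_t paddingSize = outerSize * tileSize - slicedExtent;
    std::printf("extent %2lld -> paddingSize %2lld -> %s\n",
                static_cast<long long>(slicedExtent),
                static_cast<long long>(paddingSize),
                paddingSize >= tileSize ? "no fold (artificial padding)"
                                        : "fold is legal");
  }
  return 0;
}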

mlir/lib/Dialect/Linalg/Transforms/PackAndUnpackPatterns.cpp

Lines changed: 5 additions & 3 deletions

@@ -220,9 +220,11 @@ struct FoldPadWithPackOp : public OpRewritePattern<PackOp> {
     if (!isEqualConstantIntOrValue(paddingValue, constantPaddingValue))
       return failure();
 
-    // Folding is not allowed if it introduces artificial padding. It is not
-    // safe to fold the ops if any dynamic dimension or tile size is present,
-    // because we can not infer the padding size.
+    // Folding is not allowed if it were to introduce artificial padding.
+    // Folding is also disabled in the case of dynamic dimensions and/or tile
+    // sizes - that is because it would be impossible to compute the padding
+    // size and hence to establish whether "artificial" padding would be
+    // created.
     RankedTensorType unpackedType = packOp.getSourceType();
     SmallVector<int64_t> outerShapeWithoutTranspose =
         getPackedOuterShapeWithoutTransposition(packOp);
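The same bound drives this pad-into-pack direction: the total padding relative to the unpadded source (the pad op's high padding plus whatever the pack would add implicitly) must stay under one tile. A standalone sketch of that arithmetic, using the shapes from the updated fold-into-pack-and-unpack.mlir tests further down; names are illustrative, not the pattern's actual code:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Replays the artificial-padding bound for pad-into-pack: a 9-element dim
// packed with an inner tile of 8, as in the tests below.
int main() {
  const int64_t srcDimSize = 9; // tensor<9x16xf32>, dim 0, before tensor.pad
  const int64_t tileSize = 8;   // inner_tiles = [8, 32], dim 0 tile
  // Packing into 2 outer tiles pads 9 up to 16 (7 < 8: foldable); packing
  // into 3 outer tiles pads 9 up to 24 (15 >= 8: artificial padding).
  for (int64_t outerSize : {2, 3}) {
    const int64_t paddingSize = outerSize * tileSize - srcDimSize;
    std::printf("outer %lld -> paddingSize %2lld -> %s\n",
                static_cast<long long>(outerSize),
                static_cast<long long>(paddingSize),
                paddingSize < tileSize ? "fold pad into pack"
                                       : "reject (artificial padding)");
  }
  return 0;
}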

mlir/test/Dialect/Linalg/canonicalize.mlir

Lines changed: 47 additions & 2 deletions

@@ -1889,7 +1889,7 @@ func.func @fold_cast_unpack_dynamic_tile_size(
 // linalg.unpack + tensor.extract_slice
 //===----------------------------------------------------------------------===//
 
-func.func @fold_extract_slice_into_unpack(
+func.func @fold_extract_slice_into_unpack_slicing_trailing_dim(
     %src : tensor<28x2x1x16x16xf32>, %dest : tensor<28x28x15xf32>, %size : index
   ) -> tensor<28x28x10xf32> {
   %unpack = linalg.unpack %src

@@ -1901,7 +1901,7 @@ func.func @fold_extract_slice_into_unpack(
       [0, 0, 0] [28, 28, 10] [1, 1, 1] : tensor<28x28x15xf32> to tensor<28x28x10xf32>
   return %extracted_slice : tensor<28x28x10xf32>
 }
-// CHECK-LABEL: func @fold_extract_slice_into_unpack
+// CHECK-LABEL: func @fold_extract_slice_into_unpack_slicing_trailing_dim
 // CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]
 // CHECK-SAME:    %[[DEST:[a-zA-Z0-9]+]]
 // CHECK-SAME:    %[[SIZE:[a-zA-Z0-9]+]]

@@ -1913,6 +1913,51 @@ func.func @fold_extract_slice_into_unpack(
 
 // -----
 
+// The available dimension size is [17, 32], because CeilDiv(%d1, 16) == 2.
+
+func.func @fold_extract_slice_into_unpack_slicing_dim_1(
+    %src : tensor<28x2x1x16x16xf32>, %dest : tensor<28x28x15xf32>, %size : index
+  ) -> tensor<28x17x15xf32> {
+  %unpack = linalg.unpack %src
+      inner_dims_pos = [1, 2]
+      inner_tiles = [16, 16]
+      into %dest : tensor<28x2x1x16x16xf32> -> tensor<28x28x15xf32>
+  %extracted_slice = tensor.extract_slice %unpack
+      [0, 0, 0] [28, 17, 15] [1, 1, 1] : tensor<28x28x15xf32> to tensor<28x17x15xf32>
+  return %extracted_slice : tensor<28x17x15xf32>
+}
+// CHECK-LABEL: func @fold_extract_slice_into_unpack_slicing_dim_1(
+// CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[DEST:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[SIZE:[a-zA-Z0-9]+]]
+// CHECK:         %[[DEST_SLICE:.+]] = tensor.extract_slice %[[DEST]]
+// CHECK-SAME:      [0, 0, 0] [28, 17, 15] [1, 1, 1]
+// CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[SRC]]
+// CHECK-SAME:      into %[[DEST_SLICE]]
+// CHECK:         return %[[UNPACK]]
+
+// -----
+
+// The available dimension size is [17, 32], because CeilDiv(%d1, 16) == 2.
+
+func.func @no_fold_extract_slice_into_unpack_artificial_padding(
+    %src : tensor<28x2x1x16x16xf32>, %dest : tensor<28x28x15xf32>, %size : index
+  ) -> tensor<28x16x15xf32> {
+  %unpack = linalg.unpack %src
+      inner_dims_pos = [1, 2]
+      inner_tiles = [16, 16]
+      into %dest : tensor<28x2x1x16x16xf32> -> tensor<28x28x15xf32>
+  %extracted_slice = tensor.extract_slice %unpack
+      [0, 0, 0] [28, 16, 15] [1, 1, 1] : tensor<28x28x15xf32> to tensor<28x16x15xf32>
+  return %extracted_slice : tensor<28x16x15xf32>
+}
+// CHECK-LABEL: func @no_fold_extract_slice_into_unpack_artificial_padding
+// CHECK:         linalg.unpack
+// CHECK:         tensor.extract_slice
+
+// -----
+
 func.func @no_fold_extract_slice_into_unpack_dynamic(
     %src : tensor<28x2x?x16x16xf32>, %dest : tensor<28x32x?xf32>, %size : index
   ) -> tensor<28x28x?xf32> {

mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir

Lines changed: 13 additions & 17 deletions

@@ -69,39 +69,37 @@ func.func @nofold_unpack_slice_rank_reduced(%arg0 : tensor<?x?x8x4xf32>, %arg1 :
 
 // -----
 
-func.func @pad_pack(%src: tensor<16649x16xf32>) -> tensor<2082x1x8x32xf32> {
-  %c0 = arith.constant 0 : index
+func.func @fold_pad_pack(%src: tensor<9x16xf32>) -> tensor<2x1x8x32xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %padded = tensor.pad %src low[0, 0] high[7, 0] {
   ^bb0(%arg0: index, %arg1: index):
     tensor.yield %cst : f32
-  } : tensor<16649x16xf32> to tensor<16656x16xf32>
-  %empty = tensor.empty() : tensor<2082x1x8x32xf32>
+  } : tensor<9x16xf32> to tensor<16x16xf32>
+  %empty = tensor.empty() : tensor<2x1x8x32xf32>
   %pack = linalg.pack %padded padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty
-      : tensor<16656x16xf32> -> tensor<2082x1x8x32xf32>
-  return %pack : tensor<2082x1x8x32xf32>
+      : tensor<16x16xf32> -> tensor<2x1x8x32xf32>
+  return %pack : tensor<2x1x8x32xf32>
 }
-// CHECK-LABEL: func.func @pad_pack
+// CHECK-LABEL: func.func @fold_pad_pack
 // CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]
 // CHECK:         %[[PAD_VAL:.+]] = arith.constant 0.000000e+00 : f32
-// CHECK:         %[[DEST:.+]] = tensor.empty() : tensor<2082x1x8x32xf32>
+// CHECK:         %[[DEST:.+]] = tensor.empty() : tensor<2x1x8x32xf32>
 // CHECK:         %[[PACK:.+]] = linalg.pack %[[SRC]]
 // CHECK-SAME:      padding_value(%[[PAD_VAL]] : f32)
 // CHECK-SAME:      inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %[[DEST]]
 
 // -----
 
-func.func @nofold_pad_pack_artificial_padding(%src: tensor<16641x16xf32>) -> tensor<2082x1x8x32xf32> {
-  %c0 = arith.constant 0 : index
+func.func @nofold_pad_pack_artificial_padding(%src: tensor<9x16xf32>) -> tensor<3x1x8x32xf32> {
   %cst = arith.constant 0.000000e+00 : f32
-  %padded = tensor.pad %src low[0, 0] high[15, 0] {
+  %padded = tensor.pad %src low[0, 0] high[8, 0] {
   ^bb0(%arg0: index, %arg1: index):
     tensor.yield %cst : f32
-  } : tensor<16641x16xf32> to tensor<16656x16xf32>
-  %empty = tensor.empty() : tensor<2082x1x8x32xf32>
+  } : tensor<9x16xf32> to tensor<17x16xf32>
+  %empty = tensor.empty() : tensor<3x1x8x32xf32>
   %pack = linalg.pack %padded padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty
-      : tensor<16656x16xf32> -> tensor<2082x1x8x32xf32>
-  return %pack : tensor<2082x1x8x32xf32>
+      : tensor<17x16xf32> -> tensor<3x1x8x32xf32>
+  return %pack : tensor<3x1x8x32xf32>
 }
 // CHECK-LABEL: func.func @nofold_pad_pack_artificial_padding(
 // CHECK:         tensor.pad

@@ -110,7 +108,6 @@ func.func @nofold_pad_pack_artificial_padding(%src: tensor<16641x16xf32>) -> tensor<2082x1x8x32xf32> {
 // -----
 
 func.func @nofold_pad_pack_with_nofold_attribute(%src: tensor<16649x16xf32>) -> tensor<2082x1x8x32xf32> {
-  %c0 = arith.constant 0 : index
   %cst = arith.constant 0.000000e+00 : f32
   %padded = tensor.pad %src nofold low[0, 0] high[7, 0] {
   ^bb0(%arg0: index, %arg1: index):

@@ -128,7 +125,6 @@ func.func @nofold_pad_pack_with_nofold_attribute(%src: tensor<16649x16xf32>) -> tensor<2082x1x8x32xf32> {
 // -----
 
 func.func @pad_pack_different_padding_value(%src: tensor<16641x16xf32>) -> tensor<2082x1x8x32xf32> {
-  %c0 = arith.constant 0 : index
   %cst0 = arith.constant 0.000000e+00 : f32
   %cst1 = arith.constant 1.000000e+00 : f32
   %padded = tensor.pad %src low[0, 0] high[15, 0] {
