Address review feedback

jerryyin · jerryyin · commit 19c26c021908 · 2025-05-05T18:14:50.000Z
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
@@ -300,10 +300,11 @@ getOrCreatePackedViewOfOperand(OpBuilder &b, Location loc, PackInfo packInfo,
 
 static bool isGenericOutsNotUsed(linalg::GenericOp genericOp) {
   int numDpsOuts = genericOp.getNumDpsInits();
+  Block *block = genericOp.getBody();
+  int numBlockArgs = block->getNumArguments();
+  int initArgStartIndex = numBlockArgs - numDpsOuts;
   for (int i = 0; i < numDpsOuts; ++i) {
-    Block *block = genericOp.getBody();
-    int numBlockArgs = block->getNumArguments();
-    int matchingInitArgIndex = numBlockArgs - numDpsOuts + i;
+    int matchingInitArgIndex = initArgStartIndex + i;
     return block->getArgument(matchingInitArgIndex).use_empty();
   }
   return true;
@@ -312,18 +313,13 @@ static bool isGenericOutsNotUsed(linalg::GenericOp genericOp) {
 /// Pack a genericOp and return it.
 static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp,
                                Value dest, AffineMap packedOutIndexingMap,
-                               const PackInfo &packInfo) {
+                               const PackInfo &packInfo,
+                               bool canUnpackPackFold) {
   Location loc = genericOp.getLoc();
   SmallVector<Value> inputOperands;
   SmallVector<Value> inputOperandsFromUnpackedSource;
   SmallVector<AffineMap> indexingMaps;
 
-  // Note: canUnpackPackFold needs to also guarantee the generic body
-  // doesn't have gather semantics. Since such scenarios has been
-  // rejected by both BubbleUpPackOpThroughGenericOp and
-  // PushDownUnPackOpThroughGenericOp, we can safely assume
-  // canUnpackPackFold is as long as init is not used.
-  bool canUnpackPackFold = isGenericOutsNotUsed(genericOp);
   for (OpOperand *inputOperand : genericOp.getDpsInputOperands()) {
     auto [packedOperand, packedIndexingMap] = getOrCreatePackedViewOfOperand(
         rewriter, loc, packInfo, genericOp, inputOperand);
@@ -338,10 +334,18 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp,
     indexingMaps.push_back(packedIndexingMap);
   }
 
+  // Note: Whether the unpack -> pack sequence can be folded also depends on
+  // the caller of this routine:
+  // 1) In the push-down-unpack pattern it can be folded, because the pack op
+  // is generated and the two ops are guaranteed to be compatible.
+  // 2) In the bubble-up-pack pattern it cannot be folded, because the unpack
+  // op can come from an arbitrary domain, so both ops must be kept.
+  canUnpackPackFold = canUnpackPackFold && isGenericOutsNotUsed(genericOp) &&
+                      !hasGatherSemantics(genericOp);
   // If The pack and unpack op can be folded:
-  // 1) use unpack op source op for operand to fold unpack -> pack sequence
-  // 2) init tensor of the generic op can be replaced by the new tensor.empty
-  // as the generic out.
+  // 1) use unpack op source op for operand to fold unpack -> pack sequence.
+  // 2) init tensor of the generic op can be replaced by the destination of the
+  // pack op.
   if (canUnpackPackFold) {
     inputOperands = inputOperandsFromUnpackedSource;
     if (auto destPack = dest.getDefiningOp<linalg::PackOp>())
@@ -484,7 +488,7 @@ bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, linalg::PackOp packOp,
     dest = packOpDest;
   }
   return packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap,
-                       *packInfo);
+                       *packInfo, /*canUnpackPackFold=*/false);
 }
 
 /// Wrapper pattern that applies bubbleUpPackOpThroughGenericOp method.
@@ -1122,7 +1126,8 @@ pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp,
 
   // Pack the genericOp.
   GenericOp newGenericOp =
-      packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap, *packInfo);
+      packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap, *packInfo,
+                    /*canUnpackPackFold=*/true);
   Value newResult =
       newGenericOp.getTiedOpResult(newGenericOp.getDpsInitOperand(0));
 
diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
@@ -1398,24 +1398,45 @@ func.func @no_push_down_unpack_through_non_divisible_expand(%5: tensor<384x32x8x
 
 // -----
 
-#map = affine_map<(d0, d1) -> (d0, d1)>
-func.func @fold_unpack_pack_after_bubble_up(%arg0: tensor<8x8x4x8xf32>) -> tensor<8x8x4x8xf32> {
-  %empty = tensor.empty() : tensor<32x64xf32>
-  %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %empty : tensor<8x8x4x8xf32> -> tensor<32x64xf32>
-  %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%unpack : tensor<32x64xf32>) outs(%empty : tensor<32x64xf32>) {
-  ^bb0(%in: f32, %out: f32):
-    %2 = arith.addf %in, %in : f32
-    linalg.yield %2 : f32
-  } -> tensor<32x64xf32>
-  %empty1 = tensor.empty() : tensor<8x8x4x8xf32>
-  %pack = linalg.pack %1 inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %empty1 : tensor<32x64xf32> -> tensor<8x8x4x8xf32>
-  return %pack : tensor<8x8x4x8xf32>
+func.func @push_unpack_in_padded_domain_foldable(%arg0: tensor<8x8x4x8xf32>, %dest: tensor<?x64xf32>, %arg1: tensor<?x64xbf16>) -> tensor<?x64xbf16> {
+  %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %dest : tensor<8x8x4x8xf32> -> tensor<?x64xf32>
+  %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack : tensor<?x64xf32>) outs(%arg1 : tensor<?x64xbf16>) {
+  ^bb0(%in: f32, %out: bf16):
+    %1 = arith.truncf %in : f32 to bf16
+    linalg.yield %1 : bf16
+  } -> tensor<?x64xbf16>
+  return %0 : tensor<?x64xbf16>
 }
 
-// CHECK-LABEL: func.func @fold_unpack_pack_after_bubble_up
+// CHECK-LABEL: func.func @push_unpack_in_padded_domain_foldable
 // CHECK-SAME:    %[[ARG0:[a-zA-Z0-9]+]]
-// CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<8x8x4x8xf32>
-// CHECK:         %[[GENERIC:.+]] = linalg.generic 
+// CHECK:         %[[EMPTY:.+]] = tensor.empty
+// CHECK:         %[[GENERIC:.+]] = linalg.generic
 // CHECK-SAME:    ins(%[[ARG0]] : tensor<8x8x4x8xf32>)
-// CHECK-SAME:    outs(%[[EMPTY]] : tensor<8x8x4x8xf32>)
-// CHECK:         return %[[GENERIC]] : tensor<8x8x4x8xf32>
+// CHECK-SAME:    outs(%[[EMPTY]] : tensor<?x8x4x8xbf16>)
+// CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[GENERIC]]
+// CHECK:         return %[[UNPACK]] : tensor<?x64xbf16>
+
+// -----
+
+func.func @push_unpack_in_padded_domain_not_foldable(%arg0: tensor<8x8x4x8xf32>, %arg1: tensor<?x64xf32>) -> tensor<?x64xf32> {
+  %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %arg1 : tensor<8x8x4x8xf32> -> tensor<?x64xf32>
+  %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%unpack : tensor<?x64xf32>) outs(%arg1 : tensor<?x64xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %1 = arith.addf %in, %out : f32
+    linalg.yield %1 : f32
+  } -> tensor<?x64xf32>
+  return %0 : tensor<?x64xf32>
+}
+
+// CHECK-LABEL: func.func @push_unpack_in_padded_domain_not_foldable
+// CHECK-SAME:    %[[ARG0:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[ARG1:[a-zA-Z0-9]+]]
+// CHECK:         %[[UNPACK:.+]] = linalg.unpack %[[ARG0]]
+// CHECK:         %[[PACK:.+]] = linalg.pack %[[ARG1]]
+// CHECK:         %[[UNPACK1:.+]] = linalg.pack %[[UNPACK]]
+// CHECK:         %[[GENERIC:.+]] = linalg.generic
+// CHECK-SAME:    ins(%[[UNPACK1]] : tensor<?x8x4x8xf32>)
+// CHECK-SAME:    outs(%[[PACK]] : tensor<?x8x4x8xf32>)
+// CHECK:         %[[UNPACK2:.+]] = linalg.unpack %[[GENERIC]]
+// CHECK:         return %[[UNPACK2]] : tensor<?x64xf32>