[mlir][linalg] fix DecomposeOuterUnitDimsPackOpPattern (#21)

chrsmcgrr · web-flow · commit fb2842edc4b9 · 2025-06-03T11:34:35.000+02:00
Given the following example:

```
func.func @pack_with_unit_outer_dims_and_non_adjacent_inner(%arg0: tensor<3x1x4xf32>, %arg1: tensor<1x1x1x4x3xf32>) -> tensor<1x1x1x4x3xf32> {
  %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0]  inner_dims_pos = [2, 0] inner_tiles = [4, 3] into %arg1 : tensor<3x1x4xf32> -> tensor<1x1x1x4x3xf32>
  return %pack : tensor<1x1x1x4x3xf32>
}
```

Until now, we would create an invalid transpose for this input. That is because we used the `getDimAndTileMapping()` function of the packOp, which transposes the tile dimensions based on the given `inner_dims_pos` value.

In the above example we have an `inner_dims_pos` of `[2, 0]`, meaning that dimensions 2 and 0 of the source tensor are the ones tiled by `inner_tiles`. This property is not required for calculating the tile sizes, as the destination tensor shape will simply be `[1x1x1x4x3]`. The inner dimension positions are only required for calculating the transpose. With this we can simplify the pattern.
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -1160,8 +1160,6 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
   Location loc = packOp.getLoc();
 
   Value input = getPackOpSourceOrPaddedSource(rewriter, packOp);
-  DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
-      packOp.getDimAndTileMapping();
   int64_t srcRank = packOp.getSourceRank();
   int64_t destRank = packOp.getDestRank();
   int64_t numTiles = destRank - srcRank;
@@ -1174,18 +1172,21 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
         packOp, "Attempting to tile non-trailing source dims!");
 
   // 1. Extract the inner tile sizes.
-  // Where possible, values are replaced with constant attributes (to match the
-  // behaviour of `getPackOpSourceOrPaddedSource`).
+  //    Use the tile sizes as defined in the operation. As all the outer
+  //    dimensions are 1 and by definition the last `k` dimensions of the
+  //    destination tensor (packed tensor) will be the tile sizes, we can simply
+  //    use the tiles for calculating our transpose permutations.
+  //
+  //    Where possible, values are replaced with constant attributes (to match
+  //    the behaviour of `getPackOpSourceOrPaddedSource`).
   SmallVector<OpFoldResult> tileSizes;
-  for (auto i : llvm::seq<unsigned>(0, srcRank)) {
-    if (dimAndTileMapping.count(i)) {
-      // Rather than taking the tile size as is, extact the actual constant
-      // value Attribute where possible, e.g.:
-      //    [Value: %tile_size = arith.constant 8 : index] --> [Attribute: 8]
-      auto [_, tileSize] =
-          getSimplifiedOfrAndStaticSizePair(dimAndTileMapping[i], rewriter);
-      tileSizes.push_back(tileSize);
-    }
+  for (const OpFoldResult &tileSizeDef : packOp.getMixedTiles()) {
+    // Rather than taking the tile size as is, extract the actual constant
+    // value Attribute where possible, e.g.:
+    //    [Value: %tile_size = arith.constant 8 : index] --> [Attribute: 8]
+    auto [_, tileSize] =
+        getSimplifiedOfrAndStaticSizePair(tileSizeDef, rewriter);
+    tileSizes.push_back(tileSize);
   }
 
   // 2. Transpose the input to match the inner tile order:
@@ -1218,9 +1219,6 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
   SmallVector<OpFoldResult> transShapeForEmptyOp(srcRank - numTiles,
                                                  oneIdxAttr);
   transShapeForEmptyOp.append(tileSizes);
-
-  applyPermutationToVector<OpFoldResult>(transShapeForEmptyOp,
-                                         srcPermForTranspose);
   Value empty = rewriter.create<tensor::EmptyOp>(
       loc, transShapeForEmptyOp, packOp.getSourceType().getElementType());
 
@@ -1233,8 +1231,7 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
   SmallVector<OpFoldResult> writeStrides(destRank, oneIdxAttr);
   SmallVector<OpFoldResult> writeOffsets(destRank, zeroIdxAttr);
   // Outer dims are all 1s!
-  SmallVector<OpFoldResult> writeSizes(destRank - dimAndTileMapping.size(),
-                                       oneIdxAttr);
+  SmallVector<OpFoldResult> writeSizes(destRank - numTiles, oneIdxAttr);
   SmallVector<int64_t> writeShape;
 
   for (auto tileSize : packOp.getMixedTiles()) {
diff --git a/mlir/test/Dialect/Linalg/decompose-pack.mlir b/mlir/test/Dialect/Linalg/decompose-pack.mlir
@@ -247,4 +247,22 @@ func.func @pack_with_unit_outer_dims_and_unit_inner(%arg0: tensor<1x1x4xf32>, %a
 // CHECK-SAME:      permutation = [1, 2, 0]
 // CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %[[DEST]]
 // CHECK-SAME:      [0, 0, 0, 0, 0] [1, 1, 1, 4, 1] [1, 1, 1, 1, 1] : tensor<1x4x1xf32> into tensor<1x1x1x4x1xf32>
-// CHECK:         return %[[INSERT]]
+// CHECK:         return %[[INSERT]]
+
+// -----
+
+func.func @pack_with_unit_outer_dims_and_non_adjacent_inner(%arg0: tensor<4x1x3xf32>, %arg1: tensor<1x1x1x3x4xf32>) -> tensor<1x1x1x3x4xf32> {
+  %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0]  inner_dims_pos = [2, 0] inner_tiles = [3, 4] into %arg1 : tensor<4x1x3xf32> -> tensor<1x1x1x3x4xf32>
+  return %pack : tensor<1x1x1x3x4xf32>
+}
+// CHECK-LABEL: func.func @pack_with_unit_outer_dims_and_non_adjacent_inner
+// CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[DEST:[a-zA-Z0-9]+]]
+// CHECK:         %[[EMPTY:.+]] = tensor.empty() : tensor<1x3x4xf32>
+// CHECK:         %[[TRANSP:.+]] = linalg.transpose
+// CHECK-SAME:      ins(%[[SRC]] : tensor<4x1x3xf32>)
+// CHECK-SAME:      outs(%[[EMPTY]] : tensor<1x3x4xf32>)
+// CHECK-SAME:      permutation = [1, 2, 0]
+// CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %[[DEST]]
+// CHECK-SAME:      [0, 0, 0, 0, 0] [1, 1, 1, 3, 4] [1, 1, 1, 1, 1] : tensor<1x3x4xf32> into tensor<1x1x1x3x4xf32>
+// CHECK:         return %[[INSERT]]
diff --git a/mlir/test/Dialect/Linalg/decompose-unpack.mlir b/mlir/test/Dialect/Linalg/decompose-unpack.mlir
@@ -169,3 +169,20 @@ func.func @unpack_with_dynamic_dims(%arg0: tensor<?x1x1x1x8x32xf32>, %arg1: tens
 // CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[EXTRACT_SLICE]] into %[[DEST]]
 // CHECK-SAME:      [0, 0, 0, 0] [%[[DIM0_DEST]], 1, 32, 8] [1, 1, 1, 1]
 // CHECK:         return %[[INSERT]]
+
+// -----
+
+func.func @unpack_with_unit_outer_dims_and_non_adjacent_inner(%arg0: tensor<1x1x1x3x4xf32>, %arg1: tensor<4x1x3xf32>) -> tensor<4x1x3xf32> {
+  %pack = linalg.unpack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [3, 4] into %arg1 : tensor<1x1x1x3x4xf32> -> tensor<4x1x3xf32>
+  return %pack : tensor<4x1x3xf32>
+}
+// CHECK-LABEL: func.func @unpack_with_unit_outer_dims_and_non_adjacent_inner
+// CHECK-SAME:     %[[SRC:[a-zA-Z0-9]+]]
+// CHECK-SAME:     %[[DEST:[a-zA-Z0-9]+]]
+// CHECK:        %[[SLICE:.+]] = tensor.extract_slice %[[SRC]][0, 0, 0, 0, 0] [1, 1, 1, 3, 4] [1, 1, 1, 1, 1] : tensor<1x1x1x3x4xf32> to tensor<3x4xf32>
+// CHECK:        %[[EMPTY:.+]] = tensor.empty() : tensor<4x3xf32>
+// CHECK:        %[[TRANSP:.+]] = linalg.transpose
+// CHECK-SAME:                      ins(%[[SLICE]] : tensor<3x4xf32>)
+// CHECK-SAME:                      outs(%[[EMPTY]] : tensor<4x3xf32>) permutation = [1, 0]
+// CHECK:        %[[INSERT:.+]] = tensor.insert_slice %transposed into %[[DEST]][0, 0, 0] [4, 1, 3] [1, 1, 1] : tensor<4x3xf32> into tensor<4x1x3xf32>
+// CHECK:        return %[[INSERT]]