Merged

Changes from 2 commits
75 changes: 56 additions & 19 deletions mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td
@@ -93,17 +93,20 @@ def Linalg_PackOp : Linalg_RelayoutOp<"pack", [
tensor of rank `n + k` with a tiled and packed layout (maybe with padding)
and optionally transposes the tiled source tensor dimensions.

`inner_dims_pos` (mandatory) specifies `k` source tensor dimensions that are
being tiled, where `0 < k <= n`. The order of the dimensions matters:
- The tiled dimensions (of size `inner_tiles`) are added to the end of the result
tensor in the order in which they appear in `inner_dims_pos`.
Comment on lines -98 to -99
Contributor:

I find this particular sentence helpful. Would you mind keeping it? You could rewrite it using code: `shape(result)[rank(result) + i] = inner_tiles[i]` for `0 <= i < k`.

Contributor Author:

Sure, the "order in which they appear in inner_dims_pos" is what threw me off. That is why I removed it. I will add the code example.

- `inner_dims_pos[i]` specifies the source tensor dimension tiled by
`inner_tiles[i]`.

`inner_tiles` (mandatory) specifies `k` tile sizes. These tile sizes
correspond to the least significant ("inner") result tensor dimension sizes,
in the same order. Tile sizes can be static or dynamic.

`inner_dims_pos` (mandatory) specifies `k` source tensor dimensions that are
being tiled, where `0 <= k <= n`.
Contributor:

I've just realised that this was incorrect. Please double-check :)

Suggested change:
- being tiled, where `0 <= k <= n`.
+ being tiled, where `0 <= k < n`.

Contributor:

Why is it incorrect? I think you can pack all the dimensions? E.g.,

func.func @pack_nc_to_ncnc(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) -> tensor<128x256xf32> {
  %0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32>
  %1 = tensor.empty() : tensor<128x256xf32>
  %2 = linalg.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %1 : tensor<4x16x32x16xf32> -> tensor<128x256xf32>
  return %2 : tensor<128x256xf32>
}

Contributor:

> I think you can pack all the dimensions?

Yes, since there are n dimensions, the indices should go from 0 to n - 1, right? So 0 <= k < n, not 0 <= k <= n.

Hopefully not having an off-by-one moment here 😊

Contributor:

The "pack" operation converts a source tensor of rank `n` into a result

tensor of rank n + k with a tiled and packed layout (maybe with padding)
and optionally transposes the tiled source tensor dimensions.

If we read from the start, I think k indicates the number of dimensions to pack, which is also the number of elements in inner_dims_pos. So it is 0 <= k <= n.

You are right that the indices should go from 0 to n-1, and we may say that in the following statement like

  `inner_tiles[i]` where `0 <= i < k`. All the values in `inner_dims_pos` are within [0, n).

Contributor Author:

I've incorporated the last statement from @hanhanW into the docs.

- `inner_dims_pos[i]` specifies the source tensor dimension tiled by
`inner_tiles[i]` where `0 <= i < k`.
- The tiled dimensions (of size `inner_tiles`) are added to the end of the
  result tensor in the order in which they appear, i.e.
  `shape(result)[rank(source) + i] = inner_tiles[i]` for `0 <= i < k`.
- The following relationship for the tiled dimensions holds:
  `shape(result)[inner_dims_pos[i]] = shape(source)[inner_dims_pos[i]] / inner_tiles[i]`,
  with the division rounding up when padding is applied.

Example: If `inner_tiles = [16, 32]`, the result tensor has a shape of
`...x16x32`. If `inner_dims_pos = [0, 1]`, the 0th source dimension is tiled
by 16 and the 1st source dimension is tiled by 32. Other source dimensions
@@ -116,7 +119,19 @@ def Linalg_PackOp : Linalg_RelayoutOp<"pack", [
```mlir
%0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32]
into %dest : tensor<128x256xf32> -> tensor<16x8 x 8x32 xf32>
// \ / \ /
// Outer Dims: 16x8 Inner Dims: 8x32

// CHW to CHWhw
%0 = linalg.pack %source inner_dims_pos = [2, 1] inner_tiles = [4, 2]
into %dest : tensor<3x20x24xf32> -> tensor<3x10x6 x 4x2 xf32>
// \ / \ /
// Outer Dims: 3x10x6 Inner Dims: 4x2

// HCW to HCWhw
%0 = linalg.pack %source inner_dims_pos = [2, 0] inner_tiles = [4, 2]
into %dest : tensor<18x3x32xf32> -> tensor<9x3x8 x 4x2 xf32>
// \ / \ /
// Outer Dims: 9x3x8 Inner Dims: 4x2
```
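For a padded variant (mirroring the `pack_descending_inner_dims_with_padding` test added by this PR), the tiled outer sizes round up and the remainder is filled with the padding value:

```mlir
// CHW to CHWhw with padding: 7 and 5 are not divisible by their tile sizes,
// so the tiled outer dims round up (ceil(7/4) = 2, ceil(5/2) = 3) and the
// leftover elements are filled with %pad.
%0 = linalg.pack %source padding_value(%pad : f32)
    inner_dims_pos = [2, 1] inner_tiles = [4, 2]
    into %dest : tensor<1x5x7xf32> -> tensor<1x3x2x4x2xf32>
```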

`outer_dims_perm` (optional) specifies a permutation for the outer
@@ -246,13 +261,6 @@ def Linalg_UnPackOp : Linalg_RelayoutOp<"unpack"> {
The "unpack" operation converts a source tensor of rank `n` with a tiled and
packed layout to a result tensor of rank `n - k`.

`inner_dims_pos` (mandatory) specifies `k` source tensor dimensions with
which the last `k` source tensor dimensions are combined, where
`0 < k <= n/2`. Each `inner_dims_pos` element must be `>= 0` and `< n - k`.
The order of the dimensions in `inner_dims_pos` matters: dimension
`inner_dims_pos[i]` is combined with dimension `n - k + i` (assuming that
`outer_dims_perm` is not specified).

`inner_tiles` (mandatory) specifies `k` tile sizes. These tile sizes
correspond to the least significant ("inner") source tensor dimension sizes.
Comment on lines 265 to 266
Contributor:

Did you mean to delete this? That's currently repeated above.

Contributor Author:

Not really. But now, I have rephrased and shuffled the text a bit. Does that help?

The behavior of this op is undefined if:
@@ -262,21 +270,50 @@ def Linalg_UnPackOp : Linalg_RelayoutOp<"unpack"> {
`inner_dims_pos[i]` (assuming that `outer_dims_perm` is not specified)
evenly.

`inner_dims_pos` (mandatory) specifies `k` result tensor (i.e. unpacked
tensor) dimensions that were tiled with the `inner_tiles` to create the
packed source tensor. The source tensor (i.e. packed tensor) dimensions can
be unpacked given `inner_dims_pos` as follows.
- For `0 <= i < k` the following relationship holds:
`shape(result)[inner_dims_pos[i]] = shape(source)[n-k+i] + shape(source)[inner_dims_pos[i]]`.
Contributor:

If there's / in tensor.pack, then there should be * in tensor.unpack, right? Also, because of padding, it should be <= rather than =, right? Please double check :)

Suggested change:
- `shape(result)[inner_dims_pos[i]] = shape(source)[n-k+i] + shape(source)[inner_dims_pos[i]]`.
+ `shape(result)[inner_dims_pos[i]] <= shape(source)[n-k+i] * shape(source)[inner_dims_pos[i]]`.

Contributor Author:

Totally right it should be * and I did forget about the padding case. The new equality would hold now.

- For `0 <= j < n-k` and `j` not in `inner_dims_pos` the following relationship holds:
`shape(result)[j] = shape(source)[j]`.
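As a quick check of the corrected relationship from the review thread above, take the padded test case `tensor<1x3x2x4x2xf32> -> tensor<1x5x7xf32>` (so `n = 5`, `k = 2`, `inner_dims_pos = [2, 1]`):
- `shape(result)[2] = 7 <= shape(source)[3] * shape(source)[2] = 4 * 2 = 8`
- `shape(result)[1] = 5 <= shape(source)[4] * shape(source)[1] = 2 * 3 = 6`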

`outer_dims_perm` (optional) specifies a permutation for the outer
dimensions. If specified, it must have `n - k` elements. If specified, this
permutation is applied before combining any dimensions.

Note: the unpack operation may drop any padding introduced by the pack
operation, and hence the following holds:
`NumElementsOf(source) >= NumElementsOf(result)`.

Examples:

```mlir
// NCnc to NC:
%0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32]
into %dest : tensor<16x8 x 8x32 xf32> -> tensor<128x256xf32>
// \ / \ /
// Outer Dims: 16x8 Inner Dims: 8x32

// KCck to CK:
%0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1]
inner_tiles = [8, 32]
into %dest : tensor<8x16 x 8x32 xf32> -> tensor<128x256xf32>
// \ / \ /
// Outer Dims: 8x16 Inner Dims: 8x32

// CHWhw to CHW:
%0 = linalg.unpack %source inner_dims_pos = [2, 1] inner_tiles = [4, 2]
into %dest : tensor<3x10x6 x 4x2 xf32> -> tensor<3x20x24xf32>
// \ / \ /
// Outer Dims: 3x10x6 Inner Dims: 4x2

// HCWhw to HCW
%0 = linalg.unpack %source inner_dims_pos = [2, 0] inner_tiles = [4, 2]
into %dest : tensor<9x3x8 x 4x2 xf32> -> tensor<18x3x32xf32>
// \ / \ /
// Outer Dims: 9x3x8 Inner Dims: 4x2
```
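For a variant that drops padding (mirroring the `unpack_descending_inner_dims_with_padding` test added by this PR), note the element counts: the source holds `1*3*2*4*2 = 48` elements but the result only `1*5*7 = 35`, the difference being the dropped padding.

```mlir
// CHWhw to CHW, dropping the padding introduced by the corresponding pack:
%0 = linalg.unpack %source inner_dims_pos = [2, 1] inner_tiles = [4, 2]
    into %dest : tensor<1x3x2x4x2xf32> -> tensor<1x5x7xf32>
```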
}];
let arguments = (ins AnyRankedTensor:$source,
11 changes: 11 additions & 0 deletions mlir/test/Dialect/Linalg/invalid.mlir
@@ -1824,6 +1824,17 @@ func.func @unpack_invalid_outer_dims_perm(%source: tensor<128x256xf32>, %dest: t

// -----

// Here the source tensor is tiled as `source[1] / 32` and `source[0] / 16`, but inner_dims_pos does not imply
// a transpose of the outer dimensions of the result tensor. The outer dimensions appear in the result tensor in
// the same order as in the source tensor, i.e. 16x4x32x16.
Contributor:

Hm, isn't this example simply missing outer_dims_perm = [1, 0]? So yes, the order of the outer dims should not be permuted.

Contributor Author:

This example was more to reinforce what the operation does, because reading the documentation I found it difficult to follow the semantics, given the interplay between inner_dims_pos, inner_tiles and outer_dims_perm.

I can remove the example if you like.

Contributor:

> I can remove the example if you like.

I do like this example, so lets keep it. My comment was more along the lines of: "Perhaps this long description could be trimmed to sth shorter referring to missing outer_dims_perm"? Here's a specific suggestion:

The outer dims in the output tensor are incorrectly/unexpectedly transposed. This could be fixed by adding `outer_dims_perm = [1, 0]` (the default value assumes no transpose).

func.func @pack_invalid_result_shape(%input: tensor<256x128xf32>, %output: tensor<4x16x32x16xf32>) -> tensor<4x16x32x16xf32> {
// expected-error@+1 {{the shape of output is not large enough to hold the packed data. Expected at least 'tensor<16x4x32x16xf32>', got 'tensor<4x16x32x16xf32>'}}
%0 = linalg.pack %input inner_dims_pos = [1, 0] inner_tiles = [32, 16] into %output : tensor<256x128xf32> -> tensor<4x16x32x16xf32>
return %0 : tensor<4x16x32x16xf32>
}
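A minimal sketch of the suggested fix (hypothetical, not part of this diff; the function name is made up): making the outer permutation explicit turns the expected outer dims 16x4 into 4x16, so the result type becomes valid:

```mlir
func.func @pack_valid_outer_dims_perm(%input: tensor<256x128xf32>, %output: tensor<4x16x32x16xf32>) -> tensor<4x16x32x16xf32> {
  // Outer dims in source order would be 16x4; outer_dims_perm = [1, 0]
  // transposes them to 4x16, matching the type of %output.
  %0 = linalg.pack %input outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 16] into %output : tensor<256x128xf32> -> tensor<4x16x32x16xf32>
  return %0 : tensor<4x16x32x16xf32>
}
```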

// -----

func.func @pack_invalid(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> {
// expected-error@+1 {{the shape of output is not large enough to hold the packed data. Expected at least 'tensor<8x8x16x32xf32>', got 'tensor<8x8x32x16xf32>'}}
%0 = linalg.pack %input inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32>
63 changes: 63 additions & 0 deletions mlir/test/Dialect/Linalg/named-ops.mlir
@@ -2771,6 +2771,69 @@ func.func @pad_and_pack_partially_dynamic(%source: tensor<?x?xf32>, %dest: tenso

// -----

func.func @pack_descending_inner_dims_with_padding(%source: tensor<1x5x7xf32>, %dest: tensor<1x3x2x4x2xf32>, %pad: f32) -> tensor<1x3x2x4x2xf32> {
Contributor:

Wouldn't "transposed" be more descriptive? Similar comment for the example below.

Suggested change:
- func.func @pack_descending_inner_dims_with_padding(%source: tensor<1x5x7xf32>, %dest: tensor<1x3x2x4x2xf32>, %pad: f32) -> tensor<1x3x2x4x2xf32> {
+ func.func @pack_transposed_inner_dims_with_padding(%source: tensor<1x5x7xf32>, %dest: tensor<1x3x2x4x2xf32>, %pad: f32) -> tensor<1x3x2x4x2xf32> {

Contributor Author:

True, I will change the name.

%0 = linalg.pack %source padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [4, 2] into %dest : tensor<1x5x7xf32> -> tensor<1x3x2x4x2xf32>
return %0 : tensor<1x3x2x4x2xf32>
}

// CHECK-LABEL: func.func @pack_descending_inner_dims_with_padding(
// CHECK-SAME: %[[SOURCE:.*]]: tensor<1x5x7xf32>,
// CHECK-SAME: %[[DEST:.*]]: tensor<1x3x2x4x2xf32>,
// CHECK-SAME: %[[PAD:.*]]: f32)
// CHECK: %{{.*}} = linalg.pack
// CHECK-SAME: inner_dims_pos = [2, 1]
// CHECK-SAME: inner_tiles = [4, 2]
// CHECK-SAME: into %[[DEST]] : tensor<1x5x7xf32> -> tensor<1x3x2x4x2xf32>

// -----

// The function suffix "with_padding" refers to the padding that was introduced by the pack operation. Here the
// unpack drops that padding, creating a tensor with fewer elements than we started with.
func.func @unpack_descending_inner_dims_with_padding(%source: tensor<1x3x2x4x2xf32>, %dest: tensor<1x5x7xf32>) -> tensor<1x5x7xf32> {
%0 = linalg.unpack %source inner_dims_pos = [2, 1] inner_tiles = [4, 2] into %dest : tensor<1x3x2x4x2xf32> -> tensor<1x5x7xf32>
return %0 : tensor<1x5x7xf32>
}

// CHECK-LABEL: func.func @unpack_descending_inner_dims_with_padding(
// CHECK-SAME: %[[SOURCE:.*]]: tensor<1x3x2x4x2xf32>,
// CHECK-SAME: %[[DEST:.*]]: tensor<1x5x7xf32>)
// CHECK: %{{.*}} = linalg.unpack
// CHECK-SAME: inner_dims_pos = [2, 1]
// CHECK-SAME: inner_tiles = [4, 2]
// CHECK-SAME: into %[[DEST]] : tensor<1x3x2x4x2xf32> -> tensor<1x5x7xf32>

// -----

func.func @pack_non_adjacent_inner_dims(%source: tensor<20x1x12xf32>, %dest: tensor<10x1x3x4x2xf32>) -> tensor<10x1x3x4x2xf32> {
%0 = linalg.pack %source inner_dims_pos = [2, 0] inner_tiles = [4, 2] into %dest : tensor<20x1x12xf32> -> tensor<10x1x3x4x2xf32>
return %0 : tensor<10x1x3x4x2xf32>
}

// CHECK-LABEL: func.func @pack_non_adjacent_inner_dims(
// CHECK-SAME: %[[SOURCE:.*]]: tensor<20x1x12xf32>,
// CHECK-SAME: %[[DEST:.*]]: tensor<10x1x3x4x2xf32>)
// CHECK: %{{.*}} = linalg.pack
// CHECK-SAME: inner_dims_pos = [2, 0]
// CHECK-SAME: inner_tiles = [4, 2]
// CHECK-SAME: into %[[DEST]] : tensor<20x1x12xf32> -> tensor<10x1x3x4x2xf32>

// -----

func.func @unpack_non_adjacent_inner_dims(%source: tensor<10x1x3x4x2xf32>, %dest: tensor<20x1x12xf32>) -> tensor<20x1x12xf32> {
%0 = linalg.unpack %source inner_dims_pos = [2, 0] inner_tiles = [4, 2] into %dest : tensor<10x1x3x4x2xf32> -> tensor<20x1x12xf32>
return %0 : tensor<20x1x12xf32>
}

// CHECK-LABEL: func.func @unpack_non_adjacent_inner_dims(
// CHECK-SAME: %[[SOURCE:.*]]: tensor<10x1x3x4x2xf32>,
// CHECK-SAME: %[[DEST:.*]]: tensor<20x1x12xf32>)
// CHECK: %{{.*}} = linalg.unpack
// CHECK-SAME: inner_dims_pos = [2, 0]
// CHECK-SAME: inner_tiles = [4, 2]
// CHECK-SAME: into %[[DEST]] : tensor<10x1x3x4x2xf32> -> tensor<20x1x12xf32>

// -----

func.func @unpack_fully_dynamic(%source: tensor<?x?x?x?xf32>, %dest: tensor<?x?xf32>, %tile_n : index, %tile_m : index) -> tensor<?x?xf32> {
%0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor<?x?x?x?xf32> -> tensor<?x?xf32>
return %0 : tensor<?x?xf32>