Skip to content

Commit f6cff33

Browse files
committed
[mlir][linalg] Extend DecomposeOuterUnitDimsPackOpPattern (linalg.pack)
Similarly to #152960, this PR fixes `getTiledOuterDims` for `linalg.pack` by ensuring that the `outer_dims_perm` attribute is properly taken into account. This enables the main change in this PR: relaxing the constraints in `DecomposeOuterUnitDimsPackOpPattern`. Specifically, the pattern is extended to allow non-unit untiled outer dimensions. This makes it consistent with the corresponding pattern for `linalg.unpack`: `DecomposeOuterUnitDimsUnPackOpPattern`. One notable assumption remains: untiled outer dimensions are not permuted. This was already the case, but it is now explicitly documented.
1 parent f53b624 commit f6cff33

File tree

5 files changed

+91
-21
lines changed

5 files changed

+91
-21
lines changed

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1651,7 +1651,10 @@ struct DecomposePadOpPattern : public OpRewritePattern<tensor::PadOp> {
16511651
/// * tensor::PadOp + linalg::TransposeOp + tensor::EmptyOp +
16521652
/// tensor::InsertSliceOp ops.
16531653
///
1654-
/// Requires that all the outer dims of the input linalg::PackOp are 1.
1654+
/// Requires that all the tiled outer dims of the input linalg::PackOp are 1.
1655+
/// Note that this constraint effectively means that only one tile is packed.
1656+
///
1657+
/// In addition, assumes that the un-tiled outer dims are not permuted.
16551658
///
16561659
/// Before:
16571660
/// ```
@@ -1691,6 +1694,7 @@ struct DecomposeOuterUnitDimsPackOpPattern
16911694
/// * tensor::ExtractSliceOp + linalg::TransposeOp + tensor::InsertSliceOp
16921695
///
16931696
/// Requires that all the tiled outer dims of the input linalg::PackOp are 1.
1697+
/// Note that this constraint effectively means that only one tile is unpacked.
16941698
///
16951699
/// Before:
16961700
/// ```

mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5272,11 +5272,18 @@ ArrayRef<int64_t> PackOp::getAllOuterDims() {
52725272

52735273
SmallVector<int64_t> PackOp::getTiledOuterDims() {
52745274
auto innerDimsPos = getInnerDimsPos();
5275-
auto packedShape = getDestType().getShape();
5275+
SmallVector<int64_t> outerDims(getAllOuterDims());
52765276
SmallVector<int64_t> res;
52775277

5278+
// Recover the original order of the outer dims.
5279+
SmallVector<int64_t> outerDimPermInv(getOuterDimsPerm());
5280+
invertPermutationVector(outerDimPermInv);
5281+
if (!outerDimPermInv.empty())
5282+
applyPermutationToVector(outerDims, outerDimPermInv);
5283+
5284+
// Collect the outer dims corresponding to the tilled inner dims.
52785285
for (auto index : innerDimsPos)
5279-
res.push_back(packedShape[index]);
5286+
res.push_back(outerDims[index]);
52805287

52815288
return res;
52825289
}

mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,9 +1134,7 @@ getPackUnpackRankReducedPerm(ArrayRef<int64_t> shape,
11341134

11351135
LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
11361136
linalg::PackOp packOp, PatternRewriter &rewriter) const {
1137-
// TODO: support the case that outer dimensions are not all 1s. A
1138-
// tensor.expand_shape will be generated in this case.
1139-
if (llvm::any_of(packOp.getAllOuterDims(),
1137+
if (llvm::any_of(packOp.getTiledOuterDims(),
11401138
[](int64_t dim) { return dim != 1; })) {
11411139
return rewriter.notifyMatchFailure(
11421140
packOp, "not all outer dimensions of the result are 1s");
@@ -1149,7 +1147,6 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
11491147
int64_t srcRank = packOp.getSourceRank();
11501148
int64_t destRank = packOp.getDestRank();
11511149
ArrayRef<int64_t> innerDimsPos = packOp.getInnerDimsPos();
1152-
int64_t numberOfTiles = innerDimsPos.size();
11531150

11541151
// 1. Get the input that is going to be packed. If the input requires padding,
11551152
// add a padding operation and return that as the input.
@@ -1160,10 +1157,14 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
11601157
// %transposed_tile = linalg.transpose ins(%source_or_padded_source),
11611158
// outs(%init)
11621159
// Assumptions made:
1163-
// - All outer dims are 1 - the corresponding transposition order doesn't
1164-
// matter, but requires all dim indices to be present.
1165-
1166-
// 2.1 Get the permutation for linalg.transpose
1160+
// - All tiled outer dims are 1 - the corresponding transposition order
1161+
// doesn't matter, but requires all dim indices to be present.
1162+
// - Un-tiled outer dims remain un-permuted. (TODO: Fail when this does not
1163+
// hold)
1164+
1165+
// 2.1 Get the permutation for linalg.transpose:
1166+
// [ untiled-dims, inner-dims-pos ]
1167+
// Note, this logic assumes that the untiled dims are not permuted.
11671168
SmallVector<int64_t> srcPermForTranspose;
11681169
for (int64_t i = 0; i < srcRank; i++) {
11691170
// We assume the `k` dimensions of the inner dim position, where `k` is the
@@ -1179,9 +1180,19 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
11791180
}
11801181
srcPermForTranspose.append(innerDimsPos.begin(), innerDimsPos.end());
11811182

1182-
// 2.2 Create the init tensor for linalg.transpose with the correct shape
1183-
SmallVector<OpFoldResult> shapeForEmptyOp(srcRank - numberOfTiles,
1184-
oneIdxAttr);
1183+
// 2.2 Create the init tensor for linalg.transpose with the correct shape:
1184+
// [ untiled-dims, tiled-dims ]
1185+
ShapedType inputTy = cast<ShapedType>(input.getType());
1186+
SmallVector<OpFoldResult> shapeForEmptyOp;
1187+
for (int64_t i = 0; i < srcRank; i++) {
1188+
if (llvm::is_contained(innerDimsPos, i))
1189+
continue;
1190+
if (inputTy.isStaticDim(i))
1191+
shapeForEmptyOp.push_back(rewriter.getIndexAttr(inputTy.getShape()[i]));
1192+
else
1193+
shapeForEmptyOp.emplace_back(
1194+
tensor::DimOp::create(rewriter, loc, input, i).getResult());
1195+
}
11851196
shapeForEmptyOp.append(packOp.getMixedTiles());
11861197

11871198
// getMixedTiles() may contain Values pointing to constant ops, not the
@@ -1206,23 +1217,34 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
12061217

12071218
// 3. Insert the inner tile to the destination:
12081219
// %inserted_tile = tensor.insert_slice(%transposed_tile)
1209-
SmallVector<OpFoldResult> writeStrides(destRank, oneIdxAttr);
1210-
SmallVector<OpFoldResult> writeOffsets(destRank, zeroIdxAttr);
1211-
// Outer dims are all 1s!
1212-
SmallVector<OpFoldResult> writeSizes(destRank - numberOfTiles, oneIdxAttr);
1213-
SmallVector<int64_t> writeShape;
1220+
1221+
// Compute the sizes attribute:
1222+
// [ outer-dims, tile-sizes ]
1223+
// Note that the output from the transpose Op excludes the tiled outer dims.
1224+
// Given the assumptions (all tiled outer dims == 1), we can safely use a
1225+
// rank-expanding tensor.insert_slice. Rather than manually computing where to
1226+
// insert new unit dims (resulting from the expansion), use the Pack op
1227+
// attributes.
1228+
SmallVector<OpFoldResult> writeSizes;
1229+
for (auto size : packOp.getAllOuterDims()) {
1230+
writeSizes.push_back(rewriter.getIndexAttr(size));
1231+
}
12141232

12151233
for (auto tileSize : packOp.getMixedTiles()) {
12161234
auto [tileSizeStatic, tileSizeOfr] =
12171235
getSimplifiedOfrAndStaticSizePair(tileSize, rewriter);
12181236
writeSizes.push_back(tileSizeOfr);
1219-
writeShape.push_back(tileSizeStatic);
12201237
}
12211238

1222-
// 4. Replace tensor.packOp with tensor.insert_slice created above
1239+
SmallVector<OpFoldResult> writeStrides(destRank, oneIdxAttr);
1240+
SmallVector<OpFoldResult> writeOffsets(destRank, zeroIdxAttr);
1241+
1242+
// TODO: Add a constructor that requires neither strides nor offsets.
12231243
auto insert = tensor::InsertSliceOp::create(
12241244
rewriter, loc, transposedOp.getResult()[0], packOp.getDest(),
12251245
writeOffsets, writeSizes, writeStrides);
1246+
1247+
// 4. Replace tensor.packOp with tensor.insert_slice created above
12261248
rewriter.replaceOp(packOp, insert.getResult());
12271249

12281250
return success();

mlir/lib/Dialect/Tensor/IR/TensorOps.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2310,6 +2310,7 @@ RankedTensorType ExtractSliceOp::inferResultType(
23102310
sourceTensorType.getEncoding());
23112311
}
23122312

2313+
// TODO: This uses neither offsets nor strides!
23132314
RankedTensorType ExtractSliceOp::inferResultType(
23142315
RankedTensorType sourceTensorType, ArrayRef<OpFoldResult> offsets,
23152316
ArrayRef<OpFoldResult> sizes, ArrayRef<OpFoldResult> strides) {

mlir/test/Dialect/Linalg/decompose-pack.mlir

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,25 @@ func.func @simple_KCRS_to_KCRSsr(%arg0: tensor<?x?xi32>, %arg1: tensor<1x1x?x1xi
3131

3232
// -----
3333

34+
func.func @NCHW_to_NCHWc(%src: tensor<2x32x16x8xf32>, %dest: tensor<2x1x16x8x32xf32>) -> tensor<2x1x16x8x32xf32> {
35+
%pack = linalg.pack %src
36+
inner_dims_pos = [1]
37+
inner_tiles = [32] into %dest
38+
: tensor<2x32x16x8xf32> -> tensor<2x1x16x8x32xf32>
39+
return %pack : tensor<2x1x16x8x32xf32>
40+
}
41+
// CHECK-LABEL: func.func @NCHW_to_NCHWc(
42+
// CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]]
43+
// CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]]
44+
// CHECK: %[[INIT:.*]] = tensor.empty() : tensor<2x16x8x32xf32>
45+
// CHECK: %[[TR:.*]] = linalg.transpose ins(%[[SRC]] : tensor<2x32x16x8xf32>) outs(%[[INIT]] : tensor<2x16x8x32xf32>) permutation = [0, 2, 3, 1]
46+
// CHECK: %[[RES:.*]] = tensor.insert_slice %[[TR]] into %[[DEST]]
47+
// CHECK-SAME: [0, 0, 0, 0, 0] [2, 1, 16, 8, 32] [1, 1, 1, 1, 1]
48+
// CHECK-SAME: : tensor<2x16x8x32xf32> into tensor<2x1x16x8x32xf32>
49+
// CHECK: return %[[RES]] : tensor<2x1x16x8x32xf32>
50+
51+
// -----
52+
3453
func.func @simple_pad_and_pack_static_tiles(%input: tensor<5x1xf32>, %output: tensor<1x1x8x2xf32>, %pad: f32) -> tensor<1x1x8x2xf32> {
3554
%0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<5x1xf32> -> tensor<1x1x8x2xf32>
3655
return %0 : tensor<1x1x8x2xf32>
@@ -295,3 +314,20 @@ func.func @pack_with_non_adjacent_and_non_permuted_inner_dims(%arg0: tensor<8x1x
295314
// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %[[DEST]]
296315
// CHECK-SAME: [0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 8, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x1xf32> into tensor<1x1x1x1x8x1xf32>
297316
// CHECK: return %[[INSERT]]
317+
318+
// -----
319+
/// Note "126", which is a non-unit tiled outer dim. This is not supported.
320+
321+
func.func @negative_non_unit_tiled_outer_dim(%dest: tensor<1x126x1x1x8xf32>, %src: tensor<1x1x1x1001xf32>, %pad: f32) -> tensor<1x126x1x1x8xf32> {
322+
%pack = linalg.pack %src
323+
padding_value(%pad : f32)
324+
outer_dims_perm = [0, 3, 2, 1]
325+
inner_dims_pos = [3]
326+
inner_tiles = [8]
327+
into %dest : tensor<1x1x1x1001xf32>
328+
-> tensor<1x126x1x1x8xf32>
329+
330+
return %pack : tensor<1x126x1x1x8xf32>
331+
}
332+
// CHECK-LABEL: @negative_non_unit_tiled_outer_dim(
333+
// CHECK: linalg.pack

0 commit comments

Comments
 (0)