From bea41f533a7ad6cfa4b688ed88c46496584b7214 Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Mon, 13 Oct 2025 14:49:00 +0000 Subject: [PATCH 1/4] [mlir][linalg] Update vectorization of linalg.pack This patch changes `vectorizeAsTensorPackOp` to require users to specify all write-side vector sizes for `linalg.pack` (not just the outer dimensions). This makes `linalg.pack` vectorization consistent with `linalg.unpack` (see #149293 for a similar change). Conceptually, `linalg.pack` consists of these high-level steps: * **Read** from the source tensor using `vector.transfer_read`. * **Re-associate** dimensions of the read value, as specified by the op (via `vector.shape_cast`). * **Transpose** the re-associated value according to the permutation in the `linalg.pack` op (via `vector.transpose`). * **Write** the result into the destination tensor via `vector.transfer_write`. Previously, the vector sizes provided by the user were interpreted as write-vector-sizes for PackOp _outer_ dims (i.e. the final step above). These were used to: * Infer read-vector-sizes using the `inner_tiles` attribute of PackOp. * Deduce vector sizes for the transpose and shape cast operations. * Ultimately determine the vector shape for the read. However, this logic breaks when one or more tile sizes are dynamic (*). In such cases, `vectorizePackOpPrecondition` would currently fail (see `@pack_with_dynamic_dims_and_dynamic_inner_tile` added in this PR - without this change it will crash). This patch updates the contract: users now directly specify _all_ the "write-vector-sizes", which inherently encode all inner tile sizes - including dynamic ones. It becomes the user's responsibility to provide valid sizes. In practice, since `linalg.pack` is typically constructed, tiled, and vectorized by the same transformation pipeline, the necessary "write-vector-sizes" should be recoverable. Notes for reviewers: * See test updates for user-facing impact. 
* Review `vectorizeAsTensorPackOp` as a new implementation rather than a diff. * Comments and variable names were updated to align with `vectorizeAsTensorUnPackOp`. (*) As a concrete example, "scalable" tile sizes are represented as dynamic values. Note, support for "scalable" vectorisation will be added in a separate PR. --- .../include/mlir/Dialect/Linalg/Utils/Utils.h | 3 +- .../Dialect/Linalg/Transforms/Transforms.cpp | 3 +- .../Linalg/Transforms/Vectorization.cpp | 218 +++++++++--------- mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 8 +- .../linalg-ops-with-patterns.mlir | 2 + .../Linalg/vectorization/linalg-ops.mlir | 98 +++++++- 6 files changed, 217 insertions(+), 115 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h index 48978eb7663d5..49c75f4b00280 100644 --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -37,7 +37,8 @@ namespace linalg { /// This function uses the helper function `computePackUnPackPerm` to get /// the permutation vector. Only major difference between UnPack and Pack is /// that packOp uses destination rank whereas unpack Uses source rank. -SmallVector getPackInverseDestPerm(linalg::PackOp packOp); +SmallVector getPackInverseDestPerm(linalg::PackOp packOp, + PackingMetadata &metadatap); /// Shell function to compute the Source Permutation of unPackOp. /// This function, like the getPackInverseDestPerm uses the helper function diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index eb2d825e17e44..12b6da774701c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -234,8 +234,9 @@ FailureOr linalg::lowerPack(RewriterBase &rewriter, // before any outer or inner permutations have been applied. 
PackingMetadata packingMetadata = computePackingMetadata( packedTensorType.getRank(), packOp.getInnerDimsPos()); + PackingMetadata packMetadata; SmallVector packedToStripMinedShapePerm = - getPackInverseDestPerm(packOp); + getPackInverseDestPerm(packOp, packMetadata); // 3. Compute the stripMinedShape: this is the packed shape before any outer // or inner permutations have been applied. diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 9d62491214018..591bae5d7a157 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1564,13 +1564,6 @@ vectorizeAsLinalgGeneric(RewriterBase &rewriter, VectorizationState &state, return success(); } -/// Given a linalg::PackOp, return the `dest` shape before any packing -/// permutations. -static SmallVector getTiledPackShape(linalg::PackOp packOp, - ArrayRef destShape) { - return applyPermutation(destShape, linalg::getPackInverseDestPerm(packOp)); -} - /// Determines whether a mask for xfer_write is trivially "all true" /// /// Given all the inputs required to generate a mask (mask sizes and shapes), @@ -1761,99 +1754,6 @@ createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vecToStore, return mlir::vector::maskOperation(builder, write, maskForWrite); } -/// Vectorize linalg::PackOp with (1) static inner_tiles (2) constant -/// padding value and (3) input vector sizes into: -/// -/// masked_transfer_read->shape_cast->transpose->transfer_write_in_bounds -/// -/// As in the following example: -/// %pack = tensor.pack %src inner_dims_pos = [2, 1] inner_tiles = [16, 2] -/// into %dst : tensor<32x8x16xf32> -> tensor<32x4x1x16x2xf32> -/// -/// This pack would be vectorized to: -/// -/// %load = vector.mask %mask { -/// vector.transfer_read %arg0[%c0, %c0, %c0], %cst -/// {in_bounds = [true, true, true]} : -/// tensor<32x7x16xf32>, vector<32x8x16xf32> -/// } : vector<32x8x16xi1> 
-> vector<32x8x16xf32> -/// %shape_cast = vector.shape_cast %load : vector<32x8x16xf32> -/// to vector<32x4x2x1x16xf32> -/// %transpose = vector.transpose %shape_cast, [0, 1, 3, 4, 2] -/// : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> -/// %write = vector.transfer_write %transpose, -/// %empty[%c0_0, %c0_0, %c0_0, %c0_0, %c0_0] -/// {in_bounds = [true, true, true, true, true]} -/// : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> -/// -/// If the (3) input vector sizes are not provided, the vector sizes are -/// determined by the result tensor shape and the `in_bounds` -/// attribute is used instead of masking to mark out-of-bounds accesses. -/// -/// NOTE: The input vector sizes specify the dimensions corresponding to the -/// outer dimensions of the output tensor. The remaining dimensions are -/// computed based on, e.g., the static inner tiles. -/// Supporting dynamic inner tiles will require the user to specify the -/// missing vector sizes. This is left as a TODO. -static LogicalResult -vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp, - ArrayRef inputVectorSizes, - SmallVectorImpl &newResults) { - // TODO: Introduce a parent class that will handle the insertion point update. - OpBuilder::InsertionGuard g(rewriter); - rewriter.setInsertionPoint(packOp); - - Location loc = packOp.getLoc(); - std::optional padValue = packOp.getPaddingValue() - ? std::optional(packOp.getPaddingValue()) - : std::nullopt; - - // If the input vector sizes are not provided, then the vector sizes are - // determined by the result tensor shape. In case the vector sizes aren't - // provided, we update the inBounds attribute instead of masking. - bool useInBoundsInsteadOfMasking = false; - if (inputVectorSizes.empty()) { - ArrayRef resultTensorShape = packOp.getDestType().getShape(); - inputVectorSizes = resultTensorShape.take_front(packOp.getSourceRank()); - useInBoundsInsteadOfMasking = true; - } - - // Create masked TransferReadOp. 
- SmallVector inputShape(inputVectorSizes); - auto innerTiles = packOp.getStaticInnerTiles(); - auto innerDimsPos = packOp.getInnerDimsPos(); - auto outerDimsPerm = packOp.getOuterDimsPerm(); - if (!outerDimsPerm.empty()) - applyPermutationToVector(inputShape, - invertPermutationVector(outerDimsPerm)); - for (auto [idx, size] : enumerate(innerTiles)) - inputShape[innerDimsPos[idx]] *= size; - auto maskedRead = vector::createReadOrMaskedRead( - rewriter, loc, packOp.getSource(), inputShape, padValue, - useInBoundsInsteadOfMasking, - /*inputScalableVecSizes=*/{}); - - // Create ShapeCastOp. - SmallVector destShape(inputVectorSizes); - destShape.append(innerTiles.begin(), innerTiles.end()); - auto tiledPackType = VectorType::get(getTiledPackShape(packOp, destShape), - packOp.getDestType().getElementType()); - auto shapeCastOp = - vector::ShapeCastOp::create(rewriter, loc, tiledPackType, maskedRead); - - // Create TransposeOp. - auto destPermutation = - invertPermutationVector(getPackInverseDestPerm(packOp)); - auto transposeOp = vector::TransposeOp::create( - rewriter, loc, shapeCastOp.getResult(), destPermutation); - - // Create TransferWriteOp. - Operation *write = createWriteOrMaskedWrite( - rewriter, loc, transposeOp.getResult(), packOp.getDest()); - newResults.push_back(write->getResult(0)); - return success(); -} - /// Given the re-associations, "collapses" the input Vector type /// /// This is similar to CollapseShapeOp::inferCollapsedType with two notable @@ -1901,12 +1801,120 @@ static VectorType getCollapsedVecType(VectorType type, return VectorType::get(newShape, type.getElementType(), newScalableFlags); } +/// Vectorize `linalg.pack` as: +/// * xfer_read -> shape_cast -> transpose -> xfer_write +/// +/// The input-vector-sizes specify the _write_ vector sizes (i.e. the vector +/// sizes for the xfer_write operation). This is sufficient to infer the other +/// vector sizes required here. 
+/// +/// If the vector sizes are not provided: +/// * the vector sizes are determined from the destination tensor static shape. +/// * the inBounds attribute is used instead of masking. +/// +/// EXAMPLE (no vector sizes): +/// ``` +/// %pack = tensor.pack %src +/// inner_dims_pos = [2, 1] +/// inner_tiles = [16, 2] +/// into %dst : tensor<32x8x16xf32> -> tensor<32x4x1x16x2xf32> +/// ``` +/// is vectorized as: +/// ``` +/// %read = vector.transfer_read %src +/// : tensor<32x7x16xf32>, vector<32x8x16xf32> +/// %sc = vector.shape_cast %read +/// : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> +/// %tr = vector.transpose %sc, [0, 1, 3, 4, 2] +/// : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> +/// %write = vector.transfer_write %tr into %dest +/// : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> +/// ``` +static LogicalResult +vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp, + ArrayRef inputVectorSizes, + SmallVectorImpl &newResults) { + if (!inputVectorSizes.empty()) { + assert(inputVectorSizes.size() == packOp.getDestRank() && + "Invalid number of input vector sizes!"); + } + + // TODO: Introduce a parent class that will handle the insertion point update. + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(packOp); + + Location loc = packOp.getLoc(); + std::optional padValue = packOp.getPaddingValue() + ? std::optional(packOp.getPaddingValue()) + : std::nullopt; + + SmallVector destShape = + SmallVector(packOp.getDestType().getShape()); + + // This is just a convenience alias to clearly communicate that the input + // vector sizes determine the _write_ sizes. + ArrayRef &writeVectorSizes = inputVectorSizes; + + // In the absence of input-vector-sizes, use the _static_ input tensor shape. + // In addition, use the inBounds attribute instead of masking. 
+ bool useInBoundsInsteadOfMasking = false; + if (writeVectorSizes.empty()) { + if (ShapedType::isDynamicShape(destShape)) + return rewriter.notifyMatchFailure(packOp, + "Unable to infer vector sizes!"); + + writeVectorSizes = destShape; + useInBoundsInsteadOfMasking = true; + } + + // Compute vector type for the _read_ operation. The required dims are + // determined based on the _write_ vector sizes. This is done in two + // steps: + // 1) Invert the permutation/transposition that's part of the Pack + // operation. + // 2) Collapse the tiled sizes/dims to "return" to the unpacked domain. + PackingMetadata packMetadata; + auto destInvPermutation = getPackInverseDestPerm(packOp, packMetadata); + + SmallVector writeVecSizesUnpermuted(writeVectorSizes); + applyPermutationToVector(writeVecSizesUnpermuted, destInvPermutation); + + VectorType readVecType = getCollapsedVecType( + VectorType::get(writeVecSizesUnpermuted, + packOp.getType().getElementType()), + getSymbolLessAffineMaps(convertReassociationIndicesToExprs( + rewriter.getContext(), packMetadata.reassociations))); + + // Create masked TransferReadOp. + auto maskedRead = vector::createReadOrMaskedRead( + rewriter, loc, packOp.getSource(), readVecType.getShape(), padValue, + useInBoundsInsteadOfMasking, + /*inputScalableVecSizes=*/{}); + + // Create ShapeCastOp. + auto expandedVecType = VectorType::get(writeVecSizesUnpermuted, + packOp.getType().getElementType()); + auto shapeCastOp = + vector::ShapeCastOp::create(rewriter, loc, expandedVecType, maskedRead); + + // Create TransposeOp. + auto destPermutation = invertPermutationVector(destInvPermutation); + auto transposeOp = vector::TransposeOp::create( + rewriter, loc, shapeCastOp.getResult(), destPermutation); + + // Create TransferWriteOp. 
+ Operation *write = createWriteOrMaskedWrite( + rewriter, loc, transposeOp.getResult(), packOp.getDest()); + newResults.push_back(write->getResult(0)); + return success(); +} + /// Vectorize `linalg.unpack` as: /// * xfer_read -> vector.transpose -> vector.shape_cast -> xfer_write /// -/// The input-vector-sizes specify the read vector sizes (i.e. the vector sizes -/// for the xfer_read operation). This is sufficient to infer the other vector -/// sizes required here. +/// The input-vector-sizes specify the _read_ vector sizes (i.e. the vector +/// sizes for the xfer_read operation). This is sufficient to infer the other +/// vector sizes required here. /// /// If the vector sizes are not provided: /// * the vector sizes are determined from the input tensor static shape. @@ -1960,7 +1968,8 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp, // In the absence of input-vector-sizes, use the _static_ input tensor shape. if (inputVectorSizes.empty()) { if (ShapedType::isDynamicShape(sourceShape)) - return failure(); + return rewriter.notifyMatchFailure(unpackOp, + "Unable to infer vector sizes!"); readVectorSizes.assign(sourceShape.begin(), sourceShape.end()); useInBoundsInsteadOfMasking = true; @@ -2443,6 +2452,7 @@ vectorizePackOpPrecondition(linalg::PackOp packOp, ArrayRef inputVectorSizes) { auto padValue = packOp.getPaddingValue(); Attribute cstAttr; + // TODO: Relax this condition if (padValue && !matchPattern(padValue, m_Constant(&cstAttr))) { LDBG() << "pad value is not constant: " << packOp; return failure(); diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index 24d3722cf5426..a91397d29f3e3 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -171,9 +171,9 @@ computePackUnPackPerm(int64_t rank, ArrayRef &innerDimsPos, namespace mlir { namespace linalg { -SmallVector getPackInverseDestPerm(PackOp packOp) { +SmallVector 
getPackInverseDestPerm(PackOp packOp, + PackingMetadata &pMetadata) { - PackingMetadata pMetadata; int64_t packedRank = packOp.getDestType().getRank(); ArrayRef innerDimPos = packOp.getInnerDimsPos(); ArrayRef outerPerm = packOp.getOuterDimsPerm(); @@ -189,11 +189,11 @@ SmallVector getUnPackInverseSrcPerm(UnPackOp unpackOp) { SmallVector getUnPackInverseSrcPerm(UnPackOp unpackOp, PackingMetadata &metadata) { - int64_t unpackRank = unpackOp.getSourceType().getRank(); + int64_t packedRank = unpackOp.getSourceType().getRank(); ArrayRef innerDimPos = unpackOp.getInnerDimsPos(); ArrayRef outerPerm = unpackOp.getOuterDimsPerm(); SmallVector unpackInvSrcPerm = - computePackUnPackPerm(unpackRank, innerDimPos, outerPerm, metadata); + computePackUnPackPerm(packedRank, innerDimPos, outerPerm, metadata); return unpackInvSrcPerm; } diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir index 93a03369be239..cd472802dd307 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir @@ -285,6 +285,8 @@ module attributes {transform.with_named_sequence} { ///---------------------------------------------------------------------------------------- /// Tests for linalg.pack +/// +/// TODO: Add similar tests for linalg.unpack ///---------------------------------------------------------------------------------------- // Note, see a similar test in: diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir index 1304a90349f71..6d3544ff4f23d 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir @@ -1335,7 +1335,7 @@ func.func @pack_no_padding(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x16x2x module attributes {transform.with_named_sequence} { 
transform.named_sequence @__transform_main(%src: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.pack"]} in %src : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [4, 1, 32] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [4, 1, 32, 16, 2] : !transform.any_op transform.yield } } @@ -1378,7 +1378,7 @@ func.func @pack_with_padding(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [32, 4, 1] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [32, 4, 1, 16, 2] : !transform.any_op transform.yield } } @@ -1424,8 +1424,15 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func @pack_with_dynamic_dims // CHECK-SAME: %[[SRC:.*]]: tensor, // CHECK-SAME: %[[DEST:.*]]: tensor -func.func @pack_with_dynamic_dims(%src: tensor, %dest: tensor) -> tensor { - %pack = linalg.pack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor -> tensor +func.func @pack_with_dynamic_dims( + %src: tensor, + %dest: tensor) -> tensor { + + %pack = linalg.pack %src + inner_dims_pos = [1, 0] + inner_tiles = [16, 2] + into %dest : tensor -> tensor + return %pack : tensor } @@ -1433,30 +1440,111 @@ func.func @pack_with_dynamic_dims(%src: tensor, %dest: tensor // CHECK-DAG: %[[D1_0:.*]] = tensor.dim {{.*}} %[[C1_0]] : tensor // CHECK: %[[MASK:.*]] = vector.create_mask %[[D0_0]], %[[D1_0]] : vector<8x16xi1> + +/// --= read =--- // CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { // CHECK-SAME: vector.transfer_read %{{.*}}[%[[C0_1]], %[[C0_1]]], %[[CST]] // CHECK-SAME: {in_bounds = [true, true]} : tensor, vector<8x16xf32> // CHECK-SAME: } : vector<8x16xi1> -> 
vector<8x16xf32> + +/// --= shape_cast =--- // CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<8x16xf32> to vector<4x2x1x16xf32> + +/// --= transpose =--- // CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 2, 3, 1] : vector<4x2x1x16xf32> to vector<4x1x16x2xf32> + +/// Compute mask for xfer_write // CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor // CHECK-DAG: %[[D3:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor // CHECK: %[[MASK_0:.*]] = vector.create_mask %[[D2]], %[[D3]], %[[C16]], %[[C2]] : vector<4x1x16x2xi1> + +/// --= write =--- // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_0]] { // CHECK-SAME: vector.transfer_write %[[TR]], %[[DEST]][%[[C0_2]], %[[C0_2]], %[[C0_2]], %[[C0_2]]] // CHECK-SAME: {in_bounds = [true, true, true, true]} : vector<4x1x16x2xf32>, tensor + // CHECK: return %[[WRITE]] : tensor module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [4, 1] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [4, 1, 16, 2] : !transform.any_op + transform.yield + } +} + +// ----- + +/// Similar to the test above, but one of the inner tile sizes is dynamic. As a +/// result, more output dims are dynamic (and, e.g., output mask calculation is a bit different). 
+ +// CHECK-LABEL: func @pack_with_dynamic_dims_and_dynamic_inner_tile +// CHECK-SAME: %[[SRC:.*]]: tensor, +// CHECK-SAME: %[[DEST:.*]]: tensor +func.func @pack_with_dynamic_dims_and_dynamic_inner_tile( + %src: tensor, + %dest: tensor) -> tensor { + + %c16 = arith.constant 16 : index + + %pack = linalg.pack %src + inner_dims_pos = [1, 0] + inner_tiles = [%c16, 2] + into %dest : tensor -> tensor + + return %pack : tensor +} + +// CHECK-DAG: %[[CST:.*]] = ub.poison : f32 +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1_0:.*]] = arith.constant 1 : index + +/// Compute mask for xfer_read +// CHECK-DAG: %[[D0_0:.*]] = tensor.dim {{.*}} %[[C0_0]] : tensor +// CHECK-DAG: %[[D1_0:.*]] = tensor.dim {{.*}} %[[C1_0]] : tensor +// CHECK: %[[MASK:.*]] = vector.create_mask %[[D0_0]], %[[D1_0]] : vector<8x16xi1> + +/// --= read =--- +// CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { +// CHECK-SAME: vector.transfer_read %{{.*}}[%[[C0_1]], %[[C0_1]]], %[[CST]] +// CHECK-SAME: {in_bounds = [true, true]} : tensor, vector<8x16xf32> +// CHECK-SAME: } : vector<8x16xi1> -> vector<8x16xf32> + +/// --= shape_cast =--- +// CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<8x16xf32> to vector<4x2x1x16xf32> + +/// --= transpose =--- +// CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 2, 3, 1] : vector<4x2x1x16xf32> to vector<4x1x16x2xf32> + +/// Compute mask for xfer_write +// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[D2:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor +// CHECK-DAG: %[[D3:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor +// CHECK-DAG: %[[D4:.*]] = tensor.dim %[[DEST]], {{.*}} : tensor +// CHECK: %[[MASK_0:.*]] = vector.create_mask %[[D2]], %[[D3]], %[[D4]], %[[C2_2]] : vector<4x1x16x2xi1> + +/// --= write =--- +// CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_0]] 
{ +// CHECK-SAME: vector.transfer_write %[[TR]], %[[DEST]][%[[C0_2]], %[[C0_2]], %[[C0_2]], %[[C0_2]]] +// CHECK-SAME: {in_bounds = [true, true, true, true]} : vector<4x1x16x2xf32>, tensor + +// CHECK: return %[[WRITE]] : tensor + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [4, 1, 16, 2] : !transform.any_op transform.yield } } From 182bb0cf822dc31630b225f06205af3341896259 Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Tue, 21 Oct 2025 12:01:30 +0000 Subject: [PATCH 2/4] Revert some unnecessary changes --- mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index 12b6da774701c..0215675936cb8 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -232,11 +232,9 @@ FailureOr linalg::lowerPack(RewriterBase &rewriter, // 2. Compute the permutation vector to shuffle packed shape into the shape // before any outer or inner permutations have been applied. - PackingMetadata packingMetadata = computePackingMetadata( - packedTensorType.getRank(), packOp.getInnerDimsPos()); - PackingMetadata packMetadata; + PackingMetadata packingMetadata; SmallVector packedToStripMinedShapePerm = - getPackInverseDestPerm(packOp, packMetadata); + getPackInverseDestPerm(packOp, packingMetadata); // 3. Compute the stripMinedShape: this is the packed shape before any outer // or inner permutations have been applied. 
From 2b2204764e25d611f070402afe4a6ef40d6fcf37 Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Sat, 25 Oct 2025 19:44:46 +0000 Subject: [PATCH 3/4] Address comments from HanHan Note, I deleted `getUnPackInverseSrcPerm(UnPackOp unpackOp)` - I couldn't find any uses of that hook. --- .../include/mlir/Dialect/Linalg/Utils/Utils.h | 22 +++++-------------- .../Linalg/Transforms/Vectorization.cpp | 2 +- mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 9 ++------ .../Linalg/vectorization/linalg-ops.mlir | 5 ----- 4 files changed, 9 insertions(+), 29 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h index 49c75f4b00280..a4e59cb6b30b8 100644 --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -33,23 +33,13 @@ namespace linalg { //===----------------------------------------------------------------------===// // Utilities for inferring various semantics properties of Linalg ops. //===----------------------------------------------------------------------===// -/// Shell function to compute the Destination Permutation of PackOp -/// This function uses the helper function `computePackUnPackPerm` to get -/// the permutation vector. Only major difference between UnPack and Pack is -/// that packOp uses destination rank whereas unpack Uses source rank. +/// Compute inverse permutation for the destination tensor (i.e. in the packed +/// domain). SmallVector getPackInverseDestPerm(linalg::PackOp packOp, - PackingMetadata &metadatap); - -/// Shell function to compute the Source Permutation of unPackOp. -/// This function, like the getPackInverseDestPerm uses the helper function -/// computePackUnPackPerm` to get the permutation vector. -/// Only major difference between UnPack and Pack is that packOp uses -/// destination rank whereas unpack Uses source rank. 
-SmallVector getUnPackInverseSrcPerm(linalg::UnPackOp unpackOp); - -/// Shell function to compute the Source rank permutation for unpackOp -/// Unpack requires some packing metadata data information, so created -/// another function where this value is passed by reference. + PackingMetadata &metadata); + +/// Compute inverse permutation for the source tensor (i.e. in the packed +/// domain). SmallVector getUnPackInverseSrcPerm(linalg::UnPackOp, PackingMetadata &metadata); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 591bae5d7a157..43add5c91273e 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1861,7 +1861,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp, if (writeVectorSizes.empty()) { if (ShapedType::isDynamicShape(destShape)) return rewriter.notifyMatchFailure(packOp, - "Unable to infer vector sizes!"); + "unable to infer vector sizes"); writeVectorSizes = destShape; useInBoundsInsteadOfMasking = true; diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index a91397d29f3e3..6eeb2063e0a9e 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -172,21 +172,16 @@ namespace mlir { namespace linalg { SmallVector getPackInverseDestPerm(PackOp packOp, - PackingMetadata &pMetadata) { + PackingMetadata &metadata) { int64_t packedRank = packOp.getDestType().getRank(); ArrayRef innerDimPos = packOp.getInnerDimsPos(); ArrayRef outerPerm = packOp.getOuterDimsPerm(); SmallVector packInvDestPerm = - computePackUnPackPerm(packedRank, innerDimPos, outerPerm, pMetadata); + computePackUnPackPerm(packedRank, innerDimPos, outerPerm, metadata); return packInvDestPerm; } -SmallVector getUnPackInverseSrcPerm(UnPackOp unpackOp) { - PackingMetadata metadata; - return getUnPackInverseSrcPerm(unpackOp, metadata); -} - 
SmallVector getUnPackInverseSrcPerm(UnPackOp unpackOp, PackingMetadata &metadata) { int64_t packedRank = unpackOp.getSourceType().getRank(); diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir index 6d3544ff4f23d..170bae6141609 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir @@ -1427,12 +1427,10 @@ module attributes {transform.with_named_sequence} { func.func @pack_with_dynamic_dims( %src: tensor, %dest: tensor) -> tensor { - %pack = linalg.pack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor -> tensor - return %pack : tensor } @@ -1492,14 +1490,11 @@ module attributes {transform.with_named_sequence} { func.func @pack_with_dynamic_dims_and_dynamic_inner_tile( %src: tensor, %dest: tensor) -> tensor { - %c16 = arith.constant 16 : index - %pack = linalg.pack %src inner_dims_pos = [1, 0] inner_tiles = [%c16, 2] into %dest : tensor -> tensor - return %pack : tensor } From 873c4f310e52cb0620b7089515f7b8b8effccf3a Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Mon, 27 Oct 2025 23:50:55 +0000 Subject: [PATCH 4/4] Add an empty line --- mlir/include/mlir/Dialect/Linalg/Utils/Utils.h | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h index a4e59cb6b30b8..de07f500a8669 100644 --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -33,6 +33,7 @@ namespace linalg { //===----------------------------------------------------------------------===// // Utilities for inferring various semantics properties of Linalg ops. //===----------------------------------------------------------------------===// + /// Compute inverse permutation for the destination tensor (i.e. in the packed /// domain). SmallVector getPackInverseDestPerm(linalg::PackOp packOp,