diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index 1dc42f71e10ef..2a3ce0519ab53 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -1731,11 +1731,6 @@ void populateDecomposePadPatterns(RewritePatternSet &patterns); /// \see rewriteInIm2Col for more details. void populateConvertConv2DToImg2ColPatterns(RewritePatternSet &patterns); -/// Populates `patterns` with vectorisation patterns for tensor.insert_slice. -/// TODO: Avoid having a dedicated `populate{}` for one pattern. Instead, either -/// expand or merge with other `populate{}`. -void populateInsertSliceVectorizationPatterns(RewritePatternSet &patterns); - /// Populates `patterns` with patterns that vectorize tensor.pad. /// These patterns are meant to apply in a complementary fashion. Benefits /// are used to encode a certain ordering of pattern application. To avoid diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 8f5b49e0c2130..51d1df52598c7 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -265,7 +265,6 @@ void transform::ApplyFoldAddIntoDestPatternsOp::populatePatterns( void transform::ApplyPadVectorizationPatternsOp::populatePatterns( RewritePatternSet &patterns) { linalg::populatePadOpVectorizationPatterns(patterns); - linalg::populateInsertSliceVectorizationPatterns(patterns); } //===----------------------------------------------------------------------===// @@ -3504,9 +3503,6 @@ transform::VectorizeChildrenAndApplyPatternsOp::applyToOne( patterns.add(ctx); - // Add misc. vectorization patterns (e.g. 
for tensor.insert_slice) - linalg::populateInsertSliceVectorizationPatterns(patterns); - if (getVectorizePadding()) { linalg::populatePadOpVectorizationPatterns(patterns); // This creates an alternative path for lowering tensor.pad - by diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 299bbc226dec8..a43fa86166e83 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -59,6 +59,37 @@ vectorizeConvolution(RewriterBase &rewriter, LinalgOp convOp, ArrayRef inputVecScalableFlags = {}, bool flatten1DDepthwiseConv = false); +/// Vectorize tensor::InsertSliceOp with: +/// * vector::TransferReadOp + vector::TransferWriteOp +/// The vector sizes are either: +/// * user-provided in `inputVectorSizes`, or +/// * inferred from the static dims in the input and output tensors. +/// Bails out if: +/// * vector sizes are not user-provided, and +/// * at least one dim is dynamic (in both the input and output tensors). +/// +/// Before: +/// !t_in_type = tensor<1x2x3xf32> +/// !t_out_type = tensor<9x8x7x1x2x3xf32> +/// !v_type = vector<1x2x3xf32> +/// %inserted_slice = tensor.insert_slice %src into %dest ... : !t_in_type +/// into !t_out_type +/// After: +/// %read = vector.transfer_read %src[...], %pad ... : !t_in_type, !v_type +/// %write = vector.transfer_write %read, %dest ... : !v_type, !t_out_type +static LogicalResult +vectorizeAsInsertSliceOp(RewriterBase &rewriter, tensor::InsertSliceOp sliceOp, + ArrayRef inputVectorSizes, + SmallVectorImpl &newResults); + +/// Returns the effective Pad value for the input op, provided it's a scalar. +/// +/// Many Ops exhibit pad-like behaviour, but this isn't always explicit. If +/// this Op performs padding, retrieve the padding value provided that it's +/// a scalar and static/fixed for all the padded values. Returns an empty value +/// otherwise. 
+static Value getStaticPadVal(Operation *op);
+
 /// Return the unique instance of OpType in `block` if it is indeed unique.
 /// Return null if none or more than 1 instances exist.
 template <typename OpType>
@@ -1557,6 +1588,7 @@ static LogicalResult
 vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
                         ArrayRef<int64_t> inputVectorSizes,
                         SmallVectorImpl<Value> &newResults) {
+  // TODO: Introduce a parent class that will handle the insertion point update.
   OpBuilder::InsertionGuard g(rewriter);
   rewriter.setInsertionPoint(packOp);
 
@@ -1633,6 +1665,7 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp,
                           ArrayRef<int64_t> inputVectorSizes,
                           SmallVectorImpl<Value> &newResults) {
 
+  // TODO: Introduce a parent class that will handle the insertion point update.
   OpBuilder::InsertionGuard g(rewriter);
   rewriter.setInsertionPoint(unpackOp);
 
@@ -1763,7 +1796,7 @@ vectorizeAsTensorPadOp(RewriterBase &rewriter, tensor::PadOp padOp,
   auto padValue = padOp.getConstantPaddingValue();
   Location loc = padOp.getLoc();
 
-  // transfer_write_in_bounds(transfer_read_masked(pad_source, pad_value))
+  // TODO: Introduce a parent class that will handle the insertion point update.
   OpBuilder::InsertionGuard g(rewriter);
   rewriter.setInsertionPoint(padOp);
 
@@ -1874,6 +1907,38 @@ vectorizeUnPackOpPrecondition(tensor::UnPackOp unpackOp,
   return success();
 }
 
+static LogicalResult
+vectorizeInsertSliceOpPrecondition(tensor::InsertSliceOp sliceOp,
+                                   ArrayRef<int64_t> inputVectorSizes) {
+
+  TypedValue<RankedTensorType> source = sliceOp.getSource();
+  auto sourceType = source.getType();
+  if (!VectorType::isValidElementType(sourceType.getElementType()))
+    return failure();
+
+  // Get the pad value.
+  // TransferReadOp (which is used to vectorize InsertSliceOp), requires a
+  // scalar padding value. Note that:
+  //    * for in-bounds accesses,
+  // the value is actually irrelevant. There are 2 cases in which xfer.read
+  // accesses are known to be in-bounds:
+  //  1. The source shape is static (output vector sizes would be based on
+  //     the source shape and hence all memory accesses would be in-bounds),
+  //  2. Masking is used, i.e. the output vector sizes are user-provided. In
+  //     this case it is safe to assume that all memory accesses are in-bounds.
+  //
+  // When the value is not known and not needed, use 0. Otherwise, bail out.
+  Value padValue = getStaticPadVal(sliceOp);
+  bool isOutOfBoundsRead =
+      !sourceType.hasStaticShape() && inputVectorSizes.empty();
+
+  if (!padValue && isOutOfBoundsRead) {
+    LDBG("Failed to get a pad value for out-of-bounds read access\n");
+    return failure();
+  }
+  return success();
+}
+
 static LogicalResult vectorizeLinalgOpPrecondition(
     LinalgOp linalgOp, ArrayRef<int64_t> inputVectorSizes,
     bool vectorizeNDExtract, bool flatten1DDepthwiseConv) {
@@ -2144,6 +2209,9 @@ LogicalResult mlir::linalg::vectorizeOpPrecondition(
       .Case<tensor::UnPackOp>([&](auto unpackOp) {
         return vectorizeUnPackOpPrecondition(unpackOp, inputVectorSizes);
       })
+      .Case<tensor::InsertSliceOp>([&](auto sliceOp) {
+        return vectorizeInsertSliceOpPrecondition(sliceOp, inputVectorSizes);
+      })
       .Default([](auto) { return failure(); });
 }
 
@@ -2163,8 +2231,8 @@ static void convertAffineApply(RewriterBase &rewriter, LinalgOp linalgOp) {
 }
 
 bool mlir::linalg::hasVectorizationImpl(Operation *op) {
-  return isa<LinalgOp, tensor::PadOp, tensor::PackOp, tensor::UnPackOp>(
-      op);
+  return isa<LinalgOp, tensor::PadOp, tensor::PackOp, tensor::UnPackOp,
+             tensor::InsertSliceOp>(op);
 }
 
 /// Emit a suitable vector form for an operation.
If provided,
@@ -2244,6 +2312,10 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
             return vectorizeAsTensorPackOp(rewriter, packOp,
                                            inputVectorSizes, results);
           })
+          .Case<tensor::InsertSliceOp>([&](auto sliceOp) {
+            return vectorizeAsInsertSliceOp(rewriter, sliceOp, inputVectorSizes,
+                                            results);
+          })
           .Case<tensor::UnPackOp>([&](auto unpackOp) {
             return vectorizeAsTensorUnpackOp(rewriter, unpackOp,
                                              inputVectorSizes, results);
@@ -2540,6 +2612,9 @@ struct PadOpVectorizationWithTransferWritePattern
 /// this Op performs padding, retrieve the padding value provided that it's
 /// a scalar and static/fixed for all the padded values. Returns an empty value
 /// otherwise.
+///
+/// TODO: This is used twice (when checking vectorization pre-conditions and
+/// when vectorizing). Cache results instead of re-running.
 static Value getStaticPadVal(Operation *op) {
   if (!op)
     return {};
@@ -2583,113 +2658,118 @@ static Value getStaticPadVal(Operation *op) {
   return {};
 }
 
-/// Rewrite tensor.insert.slice as a vector.transfer_read +
-/// vector.transfer_write pair. The vector size is inferred from the static
-/// dims in the input and output tensors. If a dim is dynamic in both the input
-/// and output tensors, bails out.
-///
-/// Before:
-///     !t_in_type = tensor<1x2x3xf32>
-///     !t_out_type = tensor<9x8x7x1x2x3xf32>
-///     !v_type = vector<1x2x3xf32>
-///     %inserted_slice = tensor.insert_slice %src into %dest ... : !t_in_type
-///     into !t_out_type
-/// After:
-///     %read = vector.transfer_read %src[...], %pad ... : !t_in_type, !v_type
-///     %write = vector.transfer_write %read, %dest ... : !v_type, !t_out_type
-///
-/// TODO: Support masking
-struct InsertSliceVectorizePattern
-    : public OpRewritePattern<tensor::InsertSliceOp> {
-  using OpRewritePattern<tensor::InsertSliceOp>::OpRewritePattern;
+static LogicalResult
+vectorizeAsInsertSliceOp(RewriterBase &rewriter, tensor::InsertSliceOp sliceOp,
+                         ArrayRef<int64_t> inputVectorSizes,
+                         SmallVectorImpl<Value> &newResults) {
+  // TODO: Introduce a parent class that will handle the insertion point update.
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(sliceOp);
 
-  LogicalResult matchAndRewrite(tensor::InsertSliceOp sliceOp,
-                                PatternRewriter &rewriter) const final {
-    auto sourceType = sliceOp.getSource().getType();
-    if (!VectorType::isValidElementType(sourceType.getElementType()))
-      return failure();
+  TypedValue<RankedTensorType> source = sliceOp.getSource();
+  auto sourceType = source.getType();
+  auto resultType = sliceOp.getResultType();
 
-    auto resultType = sliceOp.getResultType();
-
-    // 1. Get the pad value.
-    // TransferReadOp requires a scalar padding value. Note that:
-    //    * for in-bounds access, the value is actually irrelevant.
-    //  There are 2 cases in which xfer.read accesses are known to be in-bounds:
-    //  1. The source shape is static (output vector sizes would be based on
-    //     the source shape and hence all memory accesses would be in-bounds),
-    //  2. Masking is used (output vector sizes would be user-provided, in which
-    //     case it is assumed that all memory accesses are in-bounds). This
-    //     remains a TODO.
-    //
-    // When the value is not known and not needed, use 0. Otherwise, bail out.
-    Value padValue = getStaticPadVal(sliceOp);
-    bool isOutOfBoundsRead = !sourceType.hasStaticShape();
-
-    if (!padValue && isOutOfBoundsRead) {
-      LDBG("Failed to get a pad value for out-of-bounds read access\n");
+  // 1. Get the pad value (default to 0 when it is not statically known).
+  Value padValue = getStaticPadVal(sliceOp);
+  if (!padValue) {
+    auto elemType = sourceType.getElementType();
+    padValue = rewriter.create<arith::ConstantOp>(
+        sliceOp.getLoc(), elemType, rewriter.getZeroAttr(elemType));
+  }
+
+  // 2. Get the vector shape and in-bounds attributes
+  SmallVector<int64_t> vecShape;
+  SmallVector<bool> readInBounds;
+  SmallVector<bool> writeInBounds;
+  size_t rankDiff = resultType.getRank() - sourceType.getRank();
+  for (int64_t i = 0, end = sourceType.getRank(); i < end; ++i) {
+    if (!inputVectorSizes.empty()) {
+      vecShape.push_back(inputVectorSizes[i]);
+      readInBounds.push_back(false);
+      writeInBounds.push_back(false);
+    } else if (!sourceType.isDynamicDim(i)) {
+      vecShape.push_back(sourceType.getDimSize(i));
+      // Source shape is statically known: Neither read nor write are
+      // out-of-bounds.
+      readInBounds.push_back(true);
+      writeInBounds.push_back(true);
+    } else if (!resultType.isDynamicDim(rankDiff + i)) {
+      // Source shape is not statically known, but result shape is.
+      // Vectorize with size of result shape. This may be larger than the
+      // source size.
+      // FIXME: Using rankDiff implies that the source tensor is inserted at
+      // the end of the destination tensor. However, that's not required.
+      vecShape.push_back(resultType.getDimSize(rankDiff + i));
+      // Read may be out-of-bounds because the result size could be larger
+      // than the source size.
+      readInBounds.push_back(false);
+      // Write will be in-bounds provided that the corresponding write idx is 0.
+      // To keep this logic simple, conservatively mark as out-of-bounds.
+      writeInBounds.push_back(false);
+    } else {
+      // Neither source nor result dim of padOp is static. Cannot vectorize
+      // the copy.
+      // TODO: Add support for masking
       return failure();
     }
+  }
+  auto vecType = VectorType::get(vecShape, sourceType.getElementType());
 
-    if (!padValue) {
-      auto elemType = sourceType.getElementType();
-      padValue = rewriter.create<arith::ConstantOp>(
-          sliceOp.getLoc(), elemType, rewriter.getZeroAttr(elemType));
-    }
+  // 3. Generate TransferReadOp.
+  SmallVector<Value> readIndices(
+      vecType.getRank(),
+      rewriter.create<arith::ConstantIndexOp>(sliceOp.getLoc(), 0));
+  Operation *read = rewriter.create<vector::TransferReadOp>(
+      sliceOp.getLoc(), vecType, source, readIndices, padValue,
+      ArrayRef<bool>{readInBounds});
 
-    // 2. Get the vector shape and in-bounds attributes
-    SmallVector<int64_t> vecShape;
-    SmallVector<bool> readInBounds;
-    SmallVector<bool> writeInBounds;
-    size_t rankDiff = resultType.getRank() - sourceType.getRank();
-    for (unsigned i = 0; i < sourceType.getRank(); ++i) {
-      if (!sourceType.isDynamicDim(i)) {
-        vecShape.push_back(sourceType.getDimSize(i));
-        // Source shape is statically known: Neither read nor write are
-        // out-of-bounds.
-        readInBounds.push_back(true);
-        writeInBounds.push_back(true);
-      } else if (!resultType.isDynamicDim(i)) {
-        // Source shape is not statically known, but result shape is.
-        // Vectorize with size of result shape. This may be larger than the
-        // source size.
-        // FIXME: Using rankDiff implies that the source tensor is inserted at
-        // the end of the destination tensor. However, that's not required.
-        vecShape.push_back(resultType.getDimSize(rankDiff + i));
-        // Read may be out-of-bounds because the result size could be larger
-        // than the source size.
-        readInBounds.push_back(false);
-        // Write will in-bounds provided that the corresponding write idx is 0.
-        // To keep this logic simple, conservatively mark as out-of-bounds.
-        writeInBounds.push_back(false);
-      } else {
-        // Neither source nor result dim of padOp is static. Cannot vectorize
-        // the copy.
-        // TODO: Add support for masking
-        return failure();
-      }
+  // If vector sizes are user provided, make sure to mask xfer_read.
+  if (!inputVectorSizes.empty()) {
+    auto *srcDefOp = source.getDefiningOp();
+    if (!srcDefOp) {
+      LDBG("Unable to get the defining Op of " << sliceOp);
+      return failure();
     }
-    auto vecType = VectorType::get(vecShape, sourceType.getElementType());
 
-    // 3. Generate TransferReadOp.
-    SmallVector<Value> readIndices(
-        vecType.getRank(),
-        rewriter.create<arith::ConstantIndexOp>(sliceOp.getLoc(), 0));
-    auto read = rewriter.create<vector::TransferReadOp>(
-        sliceOp.getLoc(), vecType, sliceOp.getSource(), readIndices, padValue,
-        ArrayRef<bool>{readInBounds});
+    // Bail out instead of asserting if the defining Op cannot reify shapes.
+    auto reifyOp = dyn_cast<ReifyRankedShapedTypeOpInterface>(srcDefOp);
+    ReifiedRankedShapedTypeDims reifiedSrcSizes;
+    if (!reifyOp ||
+        failed(reifyOp.reifyResultShapes(rewriter, reifiedSrcSizes))) {
+      LDBG("Unable to reify result shapes of " << sliceOp);
+      return failure();
+    }
 
-    // 4. Generate TransferWriteOp.
-    auto writeIndices = getValueOrCreateConstantIndexOp(
-        rewriter, sliceOp.getLoc(), sliceOp.getMixedOffsets());
+    // Create the mask. `reifiedSrcSizes` holds one entry per result of
+    // `srcDefOp`, so use the entry for the result that defines `source`.
+    unsigned srcResultIdx = cast<OpResult>(source).getResultNumber();
+    auto readMaskType = VectorType::get(inputVectorSizes, rewriter.getI1Type());
+    Value maskOp = rewriter.create<vector::CreateMaskOp>(
+        sliceOp.getLoc(), readMaskType, reifiedSrcSizes[srcResultIdx]);
 
-    // 5. Finalize
-    rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
-        sliceOp, read, sliceOp.getDest(), writeIndices,
-        ArrayRef<bool>{writeInBounds});
+    // Mask the xfer_read Op
+    read = mlir::vector::maskOperation(rewriter, read, maskOp);
+  }
 
-    return success();
+  // 4. Generate TransferWriteOp.
+  if (!inputVectorSizes.empty() &&
+      ShapedType::isDynamicShape(resultType.getShape())) {
+    LDBG("TODO: Masking of xfer_write when vectorising " << sliceOp);
+    return failure();
   }
-};
+
+  auto writeIndices = getValueOrCreateConstantIndexOp(
+      rewriter, sliceOp.getLoc(), sliceOp.getMixedOffsets());
+
+  // 5. Finalize
+  Operation *write = rewriter.create<vector::TransferWriteOp>(
+      sliceOp.getLoc(), read->getResult(0), sliceOp.getDest(), writeIndices,
+      ArrayRef<bool>{writeInBounds});
+  newResults.push_back(write->getResult(0));
+
+  return success();
+}
 
 /// Rewrite use of tensor::PadOp result in InsertSliceOp.
E.g.: /// ``` @@ -2778,11 +2858,6 @@ struct PadOpVectorizationWithInsertSlicePattern } }; -void mlir::linalg::populateInsertSliceVectorizationPatterns( - RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); -} - void mlir::linalg::populatePadOpVectorizationPatterns( RewritePatternSet &patterns, PatternBenefit baseBenefit) { patterns.add tensor::EmptyOp + linalg::FillOp/tensor::GenerateOp + tensor::InsertSliceOp -/// [Pattern: GenericPadOpVectorizationPattern + InsertSliceVectorizePattern] -/// TODO: Split the test into two, one for each pattern. -///---------------------------------------------------------------------------------------- - func.func private @make_vector() -> tensor<12x13xf32> -// Same as @pad_and_insert_slice_dest in vectorization-with-patterns.mlir, but -// over here linalg::fill is not vectorized (patterns for linalg.fill are not -// included here) +// The destination of tensor.insert_slice matches the result of tensor.pad - +// not supported. + // CHECK-LABEL: func.func @pad_and_insert_slice_dest( -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x5x6xf32>) -> tensor<1x12x13xf32> { -// CHECK-NOT: tensor.pad -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[PAD:.*]] = arith.constant 5.000000e+00 : f32 -// CHECK-DAG: %[[PAD_READ:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<1x12x13xf32> -// CHECK: %[[FILL:.*]] = linalg.fill ins(%[[PAD]] : f32) outs(%[[EMPTY]] : tensor<1x12x13xf32>) -> tensor<1x12x13xf32> -// CHECK: %[[READ_1:.*]] = vector.transfer_read %[[ARG_0]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] {in_bounds = [true, true, true]} : tensor<1x5x6xf32>, vector<1x5x6xf32> -// CHECK: %[[WRITE_1:.*]] = vector.transfer_write %[[READ_1]], %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x5x6xf32>, tensor<1x12x13xf32> -// CHECK: %[[VEC:.*]] = call @make_vector() : () -> tensor<12x13xf32> -// CHECK: %[[READ_2:.*]] = vector.transfer_read 
%[[VEC]]{{\[}}%[[C0]], %[[C0]]], %[[PAD_READ]] {in_bounds = [true, true]} : tensor<12x13xf32>, vector<12x13xf32> -// CHECK: %[[RES:.*]] = vector.transfer_write %[[READ_2]], %[[WRITE_1]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<12x13xf32>, tensor<1x12x13xf32> -// CHECK: return %[[RES]] : tensor<1x12x13xf32> +// CHECK-NOT: vector.transfer_read +// CHECK-NOT: vector.transfer_write func.func @pad_and_insert_slice_dest( %arg0: tensor<1x5x6xf32>) -> tensor<1x12x13xf32> { @@ -270,8 +252,6 @@ module attributes {transform.with_named_sequence} { %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.op<"func.func"> transform.apply_patterns to %func_op { - // TODO: Split into two tests, one for each pattern - transform.apply_patterns.linalg.decompose_pad transform.apply_patterns.linalg.pad_vectorization } : !transform.op<"func.func"> transform.yield diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir index 8fbc74ec345c6..be0180fcf1763 100644 --- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir @@ -280,3 +280,26 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +// One of the _destination_ dimensions is dynamic (but _source_ dimensions are static). 
+ +func.func private @insert_slice_dynamic_dest_dim(%source: tensor, %size: index) -> tensor { + %c2 = arith.constant 2 : index + %init = tensor.empty(%size) : tensor + + %source_slice = tensor.extract_slice %source[0, %c2, 0, 0] [1, 1, 5, 1] [1, 1, 1, 1] : tensor to tensor<5x1xi32> + // expected-error @+1 {{Attempted to vectorize, but failed}} + %res = tensor.insert_slice %source_slice into %init[0, %c2] [5, 1] [1, 1] : tensor<5x1xi32> into tensor + + return %res : tensor +} + + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [8, 1] : !transform.any_op + transform.yield + } + } diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir index 0f2abe06569d6..a660144ab87fb 100644 --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -66,7 +66,7 @@ func.func @vectorize_dynamic_identity_with_constant(%arg0: tensor, module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %size = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %size = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [%size] : !transform.any_op, !transform.any_op transform.yield } @@ -690,7 +690,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : 
(!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [4, 1, 32] : !transform.any_op - transform.yield + transform.yield } } @@ -727,7 +727,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [32, 4, 1] : !transform.any_op - transform.yield + transform.yield } } @@ -768,7 +768,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [4, 1] : !transform.any_op - transform.yield + transform.yield } } @@ -933,7 +933,7 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2 %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [512, 128] : !transform.any_op transform.yield - } + } } // ----- @@ -957,7 +957,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op transform.yield - } + } } // ----- @@ -981,7 +981,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op transform.yield - } + } } // ----- @@ -1022,7 +1022,7 @@ func.func @test_vectorize_padded_pack_no_vector_sizes(%arg0: tensor<32x7x15xf32> // 
CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index // CHECK: %[[transfer_read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]] -// CHECK-SAME: {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32> +// CHECK-SAME: {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32> // CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[transfer_read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> // CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> // CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index @@ -1059,7 +1059,7 @@ func.func @test_vectorize_unpack_no_vector_sizes(%source: tensor<8x8x32x16xf32>, %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield - } + } } // ----- @@ -1083,10 +1083,10 @@ func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield - } + } } - // ----- +// ----- func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf32>, %dest: tensor<7x16xf32>) -> tensor<7x16xf32> { %0 = tensor.unpack %source outer_dims_perm=[1, 0] inner_dims_pos = [1] inner_tiles = [4] into %dest : tensor<4x7x4xf32> -> tensor<7x16xf32> @@ -1106,5 +1106,81 @@ func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield - } + } + } + +// ----- + +///---------------------------------------------------------------------------------------- +/// 
tensor.insert_slice +///---------------------------------------------------------------------------------------- + +func.func private @insert_slice_static_sizes(%source: tensor) -> tensor<5x3xi32> { + %c2 = arith.constant 2 : index + %init = tensor.empty() : tensor<5x3xi32> + + %source_slice = tensor.extract_slice %source[0, %c2, 0, 0] [1, 1, 5, 1] [1, 1, 1, 1] : tensor to tensor<5x1xi32> + %res = tensor.insert_slice %source_slice into %init[0, %c2] [5, 1] [1, 1] : tensor<5x1xi32> into tensor<5x3xi32> + + return %res : tensor<5x3xi32> +} + +// CHECK-LABEL: func.func private @insert_slice_static_sizes( +// CHECK-SAME: %[[SEC:.*]]: tensor) -> tensor<5x3xi32> { +// CHECK: %[[C_2:.*]] = arith.constant 2 : index +// CHECK: %[[INIT:.*]] = tensor.empty() : tensor<5x3xi32> +// CHECK: %[[SRC_SLICE:.*]] = tensor.extract_slice %[[SEC]][0, %[[C_2]], 0, 0] [1, 1, 5, 1] [1, 1, 1, 1] : tensor to tensor<5x1xi32> +// CHECK: %[[PAD:.*]] = arith.constant 0 : i32 +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C_5:.*]] = arith.constant 5 : index +// CHECK: %[[C_1:.*]] = arith.constant 1 : index +// CHECK: %[[MASK:.*]] = vector.create_mask %[[C_5]], %[[C_1]] : vector<8x1xi1> +// CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC_SLICE]][%[[C0]], %[[C0]]], %[[PAD]] : tensor<5x1xi32>, vector<8x1xi32> } : vector<8x1xi1> -> vector<8x1xi32> +// CHECK: %[[C_0:.*]] = arith.constant 0 : index +// CHECK: %[[RES:.*]] = vector.transfer_write %[[READ]], %[[INIT]][%[[C_0]], %[[C_2]]] : vector<8x1xi32>, tensor<5x3xi32> +// CHECK: return %[[RES]] : tensor<5x3xi32> + + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [8, 1] : !transform.any_op + transform.yield + } + } + +// ----- + +// One of the _source_ 
dimensions is dynamic (but _destination_ dimensions are static). + +func.func private @insert_slice_dynamic_src_dim(%source: tensor, %size: index) -> tensor<5x3xi32> { + %c2 = arith.constant 2 : index + %init = tensor.empty() : tensor<5x3xi32> + + %source_slice = tensor.extract_slice %source[0, %c2, 0, 0] [1, 1, %size, 1] [1, 1, 1, 1] : tensor to tensor + %res = tensor.insert_slice %source_slice into %init[0, %c2] [%size, 1] [1, 1] : tensor into tensor<5x3xi32> + + return %res : tensor<5x3xi32> +} + +// CHECK-LABEL: func.func private @insert_slice_dynamic_src_dim( +// CHECK-SAME: %[[SRC:.*]]: tensor, +// CHECK-SAME: %[[SIZE:.*]]: index) -> tensor<5x3xi32> { +// CHECK: %[[C_2:.*]] = arith.constant 2 : index +// CHECK: %[[INIT:.*]] = tensor.empty() : tensor<5x3xi32> +// CHECK: %[[SRC_SLICE:.*]] = tensor.extract_slice %[[SRC]][0, %[[C_2]], 0, 0] [1, 1, %[[SIZE]], 1] [1, 1, 1, 1] : tensor to tensor +// CHECK-DAG: %[[PAD:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[C_1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C_0:.*]] = arith.constant 0 : index +// CHECK: %[[MASK:.*]] = vector.create_mask %[[SIZE]], %[[C_1]] : vector<8x1xi1> +// CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC_SLICE]][%[[C_0]], %[[C_0]]], %[[PAD]] : tensor, vector<8x1xi32> } : vector<8x1xi1> -> vector<8x1xi32> +// CHECK: %[[C_0_1:.*]] = arith.constant 0 : index +// CHECK: %[[RES:.*]] = vector.transfer_write %[[READ]], %[[INIT]][%[[C_0_1]], %[[C_2]]] : vector<8x1xi32>, tensor<5x3xi32> +// CHECK: return %[[RES]] : tensor<5x3xi32> + + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [8, 1] : !transform.any_op + transform.yield + } }