diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index e9f8437d7c102..91d6b2a5ead9b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1010,21 +1010,22 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
 def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout",
     [Pure, AllTypesMatch<["source", "result"]>]> {
   let summary = "Convert the layout of the input operand";
   let description = [{
-    `convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying
-    the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such
-    as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once the IR is
-    lowered to WI level because that is the end result of all distributions.
+    `convert_layout` redistributes data across subgroups and/or work-items from the layout specified
+    by `input_layout` to the layout specified by `target_layout`. Both layouts must correspond to the
+    same programming scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation
+    is not valid once the IR is lowered to WI level, because that is the end result of all distributions.
   }];
-  let arguments = (ins XeGPU_Vector2DType: $source,
-                       XeGPU_LayoutAttr: $srcMap,
-                       XeGPU_LayoutAttr: $resMap
-                   );
-  let results = (outs XeGPU_Vector2DType: $result);
+  let arguments = (ins XeGPU_VectorType: $source,
+                       XeGPU_LayoutAttr: $input_layout,
+                       XeGPU_LayoutAttr: $target_layout);
+  let results = (outs XeGPU_VectorType: $result);
   let assemblyFormat = [{
-    $source attr-dict `:` type($source)
+    $source prop-dict attr-dict `:` type($source)
   }];
+  let hasFolder = 1;
   let hasVerifier = 1;
+  let hasCanonicalizer = 1;
 }

 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 1f4e817dc549c..20916ae9ef830 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -22,7 +22,7 @@ def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
 def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
 def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
 def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
-def XeGPU_Vector2DType: FixedVectorOfRankAndType<[2], [XeGPU_ScalarType]>;
+def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;

 // common base class for types in XeGPU dialect
 class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index e0046d2c9a37a..704deeaa1f26b 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -786,32 +786,55 @@ LogicalResult DpasOp::verify() {
 // XeGPU_ConvertLayoutOp
 //===----------------------------------------------------------------------===//
 LogicalResult ConvertLayoutOp::verify() {
-  auto srcMap = getSrcMapAttr();
-  auto resMap = getResMapAttr();
-  if (!srcMap)
-    return emitOpError("expected srcMap.");
-  if (!resMap)
-    return emitOpError("expected resMap.");
-
-  if (srcMap == resMap)
-    return emitOpError("expected different srcMap and resMap.");
-
-  // both srcMap and resMap should be WgLayout or SgLayout at the same time.
-  if ((!srcMap.isWgLayout() || !resMap.isWgLayout()) &&
-      (!srcMap.isSgLayout() || !resMap.isSgLayout()))
-    return emitOpError(
-        "expected srcMap and resMap be WgLayout or SgLayout at the same time.");
+  auto srcLayout = getInputLayout();
+  auto resLayout = getTargetLayout();
+  if (!srcLayout)
+    return emitOpError("expected input layout.");
+  if (!resLayout)
+    return emitOpError("expected target layout.");
+
+  // both input and target layouts should be WgLayout or SgLayout at the same
+  // time.
+  if ((!srcLayout.isWgLayout() || !resLayout.isWgLayout()) &&
+      (!srcLayout.isSgLayout() || !resLayout.isSgLayout()))
+    return emitOpError("expected input layout and target layout to be WgLayout "
+                       "or SgLayout at the same time.");

   auto shape = getSource().getType().getShape();
-  if (!XeGPUDialect::isEvenlyDistributable(shape, srcMap))
-    return emitOpError("invalid srcMap, data cannot be evenly distributed.");
+  if (!XeGPUDialect::isEvenlyDistributable(shape, srcLayout))
+    return emitOpError(
+        "invalid input layout, data cannot be evenly distributed.");

-  if (!XeGPUDialect::isEvenlyDistributable(shape, resMap))
-    return emitOpError("invalid resMap, data cannot be evenly distributed.");
+  if (!XeGPUDialect::isEvenlyDistributable(shape, resLayout))
+    return emitOpError(
+        "invalid target layout, data cannot be evenly distributed.");

   return mlir::success();
 }

+OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) {
+  if (getInputLayout() == getTargetLayout())
+    return getSource();
+  return {};
+}
+
+struct FoldConvertLayoutOp : public OpRewritePattern<xegpu::ConvertLayoutOp> {
+  using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op.getInputLayout() == op.getTargetLayout()) {
+      rewriter.replaceOp(op, op.getSource());
+      return success();
+    }
+    return failure();
+  }
+};
+
+void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
+                                                  MLIRContext *context) {
+  patterns.add<FoldConvertLayoutOp>(context);
+}
+
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 0c55588eda7a3..4656f112958b8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -76,6 +76,29 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
   }
 }

+// This pattern lowers ConvertLayoutOp by removing the inst_data field from the
+// layout attributes. Since both producer and consumer operations handle data
+// partitioning based on their own inst_data, while maintaining the original
+// input and output shapes, ConvertLayoutOp does not need to manage inst_data.
+struct ConvertLayoutOpPattern
+    : public OpRewritePattern<xegpu::ConvertLayoutOp> {
+  using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
+                                PatternRewriter &rewriter) const override {
+    xegpu::LayoutAttr input_layout = op.getInputLayoutAttr();
+    xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr();
+    if (!input_layout.getInstData() || !target_layout.getInstData())
+      return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp.");
+
+    input_layout = input_layout.dropInstData();
+    target_layout = target_layout.dropInstData();
+    auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(
+        op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout);
+    rewriter.replaceOp(op, newOp);
+    return success();
+  }
+};
+
 //===------------------------------------------------------------------------===//
 // The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops
 // to partition operations that process large shapes into multiple operations on
@@ -331,6 +354,7 @@ void XeGPUBlockingPass::runOnOperation() {
   });

   RewritePatternSet patterns(ctx);
+  patterns.add<ConvertLayoutOpPattern>(ctx);

   vector::UnrollVectorOptions vectorOptions;
   vectorOptions.setNativeShapeFn(options.nativeShape);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index b0f5bcaf90864..ef52323a9f46b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -106,12 +106,12 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
   using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;

   // Calculate offset for each subgroup
-  SmallVector
+  static SmallVector
   calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
                          const SmallVector &originalOffsets,
                          const SmallVector &localOffset,
                          const SmallVector &distUnitBaseAddr,
-                         const SmallVector &distUnitShape) const {
+                         const SmallVector &distUnitShape) {
     assert(localOffset.size() == distUnitBaseAddr.size() &&
            "localOffset and distUnitBaseAddr must have the same rank");

@@ -466,6 +466,75 @@ struct WgToSgElementwiseOp : public ConversionPattern {
   }
 };

+// clang-format off
+// Pattern for lowering ConvertLayoutOp based on sg_layout and sg_data.
+// If input_layout and target_layout have identical sg_layout and sg_data,
+// the op is rewritten to a subgroup-level ConvertLayoutOp with these fields
+// dropped. For example:
+// #a = #xegpu.layout
+// #b = #xegpu.layout
+// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<32x64xf32>
+// becomes:
+// #a = #xegpu.layout
+// #b = #xegpu.layout
+// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<16x16xf32>
+// (vector<16x16xf32> is determined by sg_data = [16, 16])
+//
+// If sg_layout or sg_data differ, SLM is used to redistribute data across subgroups.
+// For example:
+// #a = #xegpu.layout
+// #b = #xegpu.layout
+// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<32x64xf32>
+// is lowered to:
+// #a = #xegpu.layout
+// #b = #xegpu.layout
+// store_matrix %1, %slm <{layout_input_0 = #a}> : vector<32x16xf32>, matrix_desc<32x64xf32>
+// %d = load_matrix %slm <{layout_result_0 = #a}> : matrix_desc<32x64xf32> -> vector<16x32xf32>
+// xegpu.convert_layout %d <{input_layout = #a, target_layout = #b}> : vector<16x32xf32>
+// clang-format on
+struct WgToSgConvertLayoutOp
+    : public OpConversionPattern<xegpu::ConvertLayoutOp> {
+  using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    xegpu::LayoutAttr input = op.getInputLayout();
+    xegpu::LayoutAttr target = op.getTargetLayout();
+
+    if (!input || !target || !input.isWgLayout() || !target.isWgLayout())
+      return rewriter.notifyMatchFailure(
+          op, "Input and target layouts must have subgroup layout");
+
+    DenseI32ArrayAttr inputSgLayout = input.getSgLayout();
+    DenseI32ArrayAttr inputSgData = input.getSgData();
+    DenseI32ArrayAttr inputOrder = input.getOrder();
+    DenseI32ArrayAttr targetSgLayout = target.getSgLayout();
+    DenseI32ArrayAttr targetSgData = target.getSgData();
+    DenseI32ArrayAttr targetOrder = target.getOrder();
+
+    // TODO: currently we only support the optimal case, where the input and
+    // output have the same sg_layout and sg_data, so SLM is not involved.
+    if (inputSgLayout != targetSgLayout || inputSgData != targetSgData ||
+        inputOrder != targetOrder)
+      return failure();
+
+    input = input.dropSgLayoutAndData();
+    target = target.dropSgLayoutAndData();
+
+    SmallVector<Value> newOps(adaptor.getSource());
+    if (input && target) {
+      // keep the ConvertLayoutOp for the remaining fields, e.g., inst_data.
+      for (auto [i, src] : llvm::enumerate(adaptor.getSource())) {
+        auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
+            op.getLoc(), src.getType(), src, input, target);
+        newOps[i] = newOp;
+      }
+    }
+    rewriter.replaceOpWithMultiple(op, {newOps});
+    return success();
+  }
+};
+
 // Handles UnrealizedConversionCastOp generated during
 // SCFStructuralTypeConversions (step 1).
 // This op may appear as either a
 // target or source materialization for Vector values, e.g.:
@@ -550,7 +619,8 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
   patterns.add(patterns.getContext());
+               WgToSgVectorBroadcastOp, WgToSgConvertLayoutOp>(
+      patterns.getContext());
 }
 } // namespace xegpu
 } // namespace mlir
@@ -662,6 +732,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
         return isLegal(xegpu::getLayoutAttr(op.getResult()));
       });

+  target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
+      [=](xegpu::ConvertLayoutOp op) -> bool {
+        return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
+      });
+
   target.addDynamicallyLegalDialect(
       [=](Operation *op) -> std::optional<bool> {
         // Only handle elementwise mappable ops
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 66a2f03da71b2..21ba96eaeb0f8 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -124,6 +124,10 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
     Operation *defOp = result.getDefiningOp();
     assert(defOp && "result must have a defining op");

+    // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr
+    if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
+      return convertOp.getTargetLayoutAttr();
+
     // for LoadNdOp, the layout is stored in the tensor descriptor
     if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
       return getLayoutAttr(loadNd.getTensorDesc());
@@ -137,7 +141,8 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
     auto parentOp = arg.getOwner()->getParentOp();
     if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
       OpOperand *tiedInit = loop.getTiedLoopInit(arg);
-      return getLayoutAttr(tiedInit->get());
+      if (tiedInit)
+        return getLayoutAttr(tiedInit->get());
     }
   }

diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 516c2158cb0f8..0160bfee07bf2 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -548,19 +548,11 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
   return
 }

-// -----
-func.func @convert_layout_same_map(%a: vector<32x64xf16>) {
-  // expected-error@+1 {{expected different srcMap and resMap}}
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout,
-                                resMap = #xegpu.layout} : vector<32x64xf16>
-  gpu.return
-}
-
 // -----
 func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
-  // expected-error@+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}}
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout,
-                                resMap = #xegpu.layout} : vector<32x64xf16>
+  // expected-error@+1 {{expected input layout and target layout to be WgLayout or SgLayout at the same time}}
+  %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout,
+                                 target_layout = #xegpu.layout}> : vector<32x64xf16>
   gpu.return
 }

diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index 7f3ebec225cdf..017dacc8d629a 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -35,14 +35,18 @@ gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
 }

 gpu.func @convert_layout(%a: vector<32x64xf16>) {
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout,
-                                resMap = #xegpu.layout} : vector<32x64xf16>
+  // CHECK: xegpu.convert_layout
+  // CHECK-SAME: <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<32x64xf16>
+  %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout,
+                                 target_layout = #xegpu.layout}> :
+                                 vector<32x64xf16>
   gpu.return
 }

 gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout,
-                                resMap = #xegpu.layout} : vector<32x64xf16>
+  // CHECK: xegpu.convert_layout
+  // CHECK-SAME: <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<32x64xf16>
+  %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout,
+                                 target_layout = #xegpu.layout}> : vector<32x64xf16>
   gpu.return
 }

diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index e820e13f09f64..d986e5bd1cfb4 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -500,3 +500,64 @@ gpu.module @test_kernel {
     gpu.return
   }
 }
+
+// -----
+#a = #xegpu.layout
+#b = #xegpu.layout
+#c = #xegpu.layout
+
+gpu.module @test_kernel {
+  //CHECK-LABEL: gpu.func @convert_layout
+  //CHECK-SAME: [[arg0:%.+]]: memref<16x16xf16>, [[arg1:%.+]]: memref<16x16xf16>, [[arg2:%.+]]: memref<16x16xf32>
+  //CHECK: [[c8:%.+]] = arith.constant 8 : index
+  //CHECK: [[c0:%.+]] = arith.constant 0 : index
+  //CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
+  //CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
+  //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16>
+  //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16>
+  //CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf16>
+  //CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
+  //CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
+  //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  //CHECK: [[c_tdesc_0:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c0]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout>
+  //CHECK: [[c_tdesc_1:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c8]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout>
+  //CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc_0]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout>
+  //CHECK: xegpu.store_nd [[dpas1]], [[c_tdesc_1]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout>
+
+  gpu.func @convert_layout(%A: memref<16x16xf16>, %B: memref<16x16xf16>, %C: memref<16x16xf32>) {
+    %c0 = arith.constant 0 : index
+    %a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
+    %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
+    %a = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+    %b = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+    %e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> :
+                                  vector<16x16xf16>
+    %c = xegpu.dpas %e, %b {layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+    %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c>
+    xegpu.store_nd %c, %c_tdesc: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
+    gpu.return
+  }
+}
+
+// -----
+
+#lb = #xegpu.layout
+#b = #xegpu.layout
+
+gpu.module @test_kernel {
+  //CHECK: gpu.func @convert_layout([[arg0:%.+]]: vector<8x32x2xf16>) -> vector<8x32x2xf16> {
+  //CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<8x32x2xf16>
+  //CHECK: [[e1:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 0, 0], sizes = [8, 16, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<8x16x2xf16>
+  //CHECK: [[m1:%.+]] = math.exp [[e1]] {layout_result_0 = #xegpu.layout} : vector<8x16x2xf16>
+  //CHECK: [[r1:%.+]] = vector.insert_strided_slice [[m1]], [[cst]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<8x16x2xf16> into vector<8x32x2xf16>
+  //CHECK: [[e2:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 16, 0], sizes = [8, 16, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<8x16x2xf16>
+  //CHECK: [[m2:%.+]] = math.exp [[e2]] {layout_result_0 = #xegpu.layout} : vector<8x16x2xf16>
+  //CHECK: [[r2:%.+]] = vector.insert_strided_slice [[m2]], [[r1]] {offsets = [0, 16, 0], strides = [1, 1, 1]} : vector<8x16x2xf16> into vector<8x32x2xf16>
+  //CHECK: gpu.return [[r2]] : vector<8x32x2xf16>
+
+  gpu.func @convert_layout(%B: vector<8x32x2xf16>) -> vector<8x32x2xf16> {
+    %b = xegpu.convert_layout %B <{input_layout = #lb, target_layout = #b}> : vector<8x32x2xf16>
+    %e = math.exp %b {layout_result_0 = #b} : vector<8x32x2xf16>
+    gpu.return %e : vector<8x32x2xf16>
+  }
+}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 8a880068aab33..d67bdb487d8bf 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -115,7 +115,7 @@ gpu.module @test_round_robin_assignment {
     // CHECK-SAME-COUNT-3: {layout_result_0 = #xegpu.layout}
     // CHECK-SAME-COUNT-3: : vector<2x1xf32> to vector<2x4xf32>
     // CHECK-NOT: vector.broadcast
-    %broadcast = vector.broadcast %load
+    %broadcast = vector.broadcast %load {layout_result_0 = #xegpu.layout}
       : vector<24x1xf32> to vector<24x8xf32>
     gpu.return
@@ -215,4 +215,14 @@ gpu.module @test_round_robin_assignment {
     xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout>
     gpu.return
   }
+
+  gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) {
+    %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout>
+    //CHECK-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32>
+    //CHECK-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf32>
+    %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout> -> vector<32x64xf32>
+    %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout,
+                                   target_layout = #xegpu.layout}> : vector<32x64xf32>
+    gpu.return
+  }
 }
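
Note on the new folder and canonicalizer: since the old "expected different srcMap and resMap" check is removed from the verifier, a convert_layout whose input_layout equals its target_layout is now legal and simply folds to its source. A minimal sketch of that behavior follows; the module/function names and the layout values are illustrative assumptions, not taken from this patch:

    gpu.module @fold_example {
      gpu.func @convert_layout_noop(%a: vector<32x64xf16>) -> vector<32x64xf16> {
        // input_layout == target_layout, so folding/canonicalization replaces
        // %r with %a and erases the op.
        %r = xegpu.convert_layout %a
               <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16]>,
                 target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16]>}> : vector<32x64xf16>
        gpu.return %r : vector<32x64xf16>
      }
    }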
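
For the WgToSgConvertLayoutOp pattern, the "optimal" case it currently supports can be pictured with concrete layout values. The values and value names below are illustrative assumptions only: both layouts share sg_layout and sg_data, so no SLM round-trip is needed, each subgroup keeps its sg_data-shaped tile, and only the remaining fields (here inst_data plus lane fields) survive into the subgroup-level op:

    // Workgroup-level IR, before wg-to-sg distribution (assumed values):
    %r = xegpu.convert_layout %v
           <{input_layout  = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], inst_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
             target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf32>

    // Subgroup-level IR produced per subgroup: sg_layout/sg_data are dropped and
    // the vector shrinks to the sg_data shape; the remaining inst_data is later
    // consumed by the blocking pass (ConvertLayoutOpPattern drops it there).
    %r_sg = xegpu.convert_layout %v_sg
              <{input_layout  = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
                target_layout = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf32>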