From 9d0341dae9da51c916405c6e14adf4dac9d20af5 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 21 Oct 2025 18:49:43 +0000 Subject: [PATCH 01/18] add pass --- .../mlir/Dialect/XeGPU/Transforms/Passes.td | 10 +++ .../Dialect/XeGPU/Transforms/Transforms.h | 3 +- .../Dialect/XeGPU/Transforms/CMakeLists.txt | 1 + .../Transforms/XeGPUOptimizeTranspose.cpp | 69 +++++++++++++++++++ 4 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index 564d9c4d5422b..d0185159f16ac 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -80,4 +80,14 @@ def XeGPUVectorLinearize : Pass<"xegpu-vector-linearize"> { "scf::SCFDialect", "ub::UBDialect", "vector::VectorDialect"]; } +def XeGPUOptimizeTranspose : Pass<"xegpu-optimize-transpose"> { + let summary = "Optimize XeGPU loadNd operations feeding into vector.transpose"; + let description = [{ + This pass rewrites XeGPU loadNd operations that feed into vector.transpose + into more optimal forms to improve performance. + }]; + let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", + "vector::VectorDialect"]; +} + #endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h index a480195eebd00..cab1598457cbe 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h @@ -61,7 +61,8 @@ struct UnrollOptions { /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`. void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns); - +/// Appends patterns for optimizing transpose operations into `patterns`. +void populateXeGPUOptimizeTransposePatterns(RewritePatternSet &patterns); /// Appends patterns for XeGPU SIMT distribution into `patterns`. void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns); /// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op. diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt index e6f76067094ce..a2ecbc3374ba9 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt @@ -6,6 +6,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms XeGPUWgToSgDistribute.cpp XeGPUPropagateLayout.cpp XeGPUVectorLinearize.cpp + XeGPUOptimizeTranspose.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp new file mode 100644 index 0000000000000..69ba9e6fca7ba --- /dev/null +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp @@ -0,0 +1,69 @@ +//===- XeGPUOptimizeTranspose.cpp - XeGPU optimize transpose ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/Transforms/Patterns.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/Dialect/XeGPU/Transforms/Passes.h" +#include "mlir/Dialect/XeGPU/Transforms/Transforms.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace mlir { +namespace xegpu { +#define GEN_PASS_DEF_XEGPUOPTIMIZETRANSPOSE +#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc" +} // namespace xegpu +} // namespace mlir + +#define DEBUG_TYPE "xegpu-optimize-transpose" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") + +using namespace mlir; + +namespace { + +class XeGPULoadNdPattern final : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::LoadNdOp loadOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + return success(); + } +}; +} // namespace + +void xegpu::populateXeGPUOptimizeTransposePatterns( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + +namespace { + +struct XeGPUOptimizeTransposePass final + : public xegpu::impl::XeGPUOptimizeTransposeBase< + XeGPUOptimizeTransposePass> { + void runOnOperation() override { + MLIRContext &context = getContext(); + TypeConverter converter; + RewritePatternSet patterns(&context); + ConversionTarget target(context); + scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, + target); + xegpu::populateXeGPUOptimizeTransposePatterns(patterns); + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) { + DBGS() << "Optimize transpose pass failed.\n"; + return signalPassFailure(); + } + } +}; + +} // namespace From 76f7323b526079c585dda525d2fa7b3974ed4a49 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 22 Oct 2025 15:20:08 +0000 Subject: [PATCH 02/18] save work --- .../Transforms/XeGPUOptimizeTranspose.cpp | 62 +++++++++++++++++-- .../Dialect/XeGPU/optimize-transpose.mlir | 18 ++++++ 2 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 mlir/test/Dialect/XeGPU/optimize-transpose.mlir diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp index 69ba9e6fca7ba..83edbad3afce2 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp @@ -12,8 +12,10 @@ #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include namespace mlir { namespace xegpu { @@ -29,12 +31,28 @@ using namespace mlir; namespace { -class XeGPULoadNdPattern final : public OpConversionPattern { +static std::optional> +get2DLaneData(xegpu::TensorDescType tdescType) { + auto layout = tdescType.getLayoutAttr(); + if (!layout) + return std::nullopt; + auto laneData = layout.getEffectiveLaneDataAsInt(); + if (laneData.size() != 2) + return std::nullopt; + return laneData; +} + +class XeGPUCreateNdDescOpPattern final + : public OpConversionPattern { public: - using 
OpConversionPattern::OpConversionPattern; + using OpConversionPattern::OpConversionPattern; LogicalResult - matchAndRewrite(xegpu::LoadNdOp loadOp, OpAdaptor adaptor, + matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { + auto tdescTy = createNdOp.getType(); + auto convertType = this->getTypeConverter()->convertType(tdescTy); + if (convertType == tdescTy) + return failure(); return success(); } }; @@ -42,7 +60,7 @@ class XeGPULoadNdPattern final : public OpConversionPattern { void xegpu::populateXeGPUOptimizeTransposePatterns( RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); } namespace { @@ -55,6 +73,42 @@ struct XeGPUOptimizeTransposePass final TypeConverter converter; RewritePatternSet patterns(&context); ConversionTarget target(context); + + target.addDynamicallyLegalOp( + [](xegpu::CreateNdDescOp createNdOp) { + auto optionalLaneData = get2DLaneData(createNdOp.getType()); + if (!optionalLaneData) + return true; + auto laneData = optionalLaneData.value(); + return laneData[0] != 1 || laneData[1] == 1; + }); + + converter.addConversion([](xegpu::TensorDescType tdescType) { + auto optionalLaneData = get2DLaneData(tdescType); + if (!optionalLaneData) + return tdescType; + auto laneData = optionalLaneData.value(); + int64_t innerLaneData = laneData[1]; + if (laneData[0] == 1 && innerLaneData != 1) { + int elementTyBitwidth = + tdescType.getElementType().getIntOrFloatBitWidth(); + assert(elementTyBitwidth < 32 && + "Expected element type bitwidth < 32 with laneData[1] != 1"); + SmallVector newShape(tdescType.getShape()); + newShape.back() = newShape.back() / innerLaneData; + Type newElemTy = IntegerType::get(tdescType.getContext(), + elementTyBitwidth * innerLaneData); + xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get( + tdescType.getContext(), + tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1}); + return xegpu::TensorDescType::get( + newShape, newElemTy, tdescType.getArrayLength(), + tdescType.getBoundaryCheck(), tdescType.getMemorySpace(), + newLayout); + } + return tdescType; + }); + scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, target); xegpu::populateXeGPUOptimizeTransposePatterns(patterns); diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir new file mode 100644 index 0000000000000..529211e286902 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -0,0 +1,18 @@ +func.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) { + %5 = xegpu.load_nd %2[%c0, %arg3] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%c0, %arg3] {layout_result_0 = #xegpu.layout} : 
!xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %8 : vector<8x16xf32> + } {layout_result_0 = #xegpu.layout} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + return +} From 43c35bef094cd8754f332be17689dbef9e1c8516 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 22 Oct 2025 22:35:50 +0000 Subject: [PATCH 03/18] add some tests --- .../Transforms/XeGPUOptimizeTranspose.cpp | 188 +++++++++++++++--- .../Dialect/XeGPU/optimize-transpose.mlir | 119 ++++++++++- 2 files changed, 264 insertions(+), 43 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp index 83edbad3afce2..9792da7d6ce79 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp @@ -6,13 +6,19 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/Transforms/Patterns.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include @@ -42,6 +48,46 @@ get2DLaneData(xegpu::TensorDescType tdescType) { return laneData; } +static xegpu::TensorDescType +getModifiedTensorDescType(xegpu::TensorDescType tdescType) { + auto optionalLaneData = get2DLaneData(tdescType); + if (!optionalLaneData) + return tdescType; + auto laneData = optionalLaneData.value(); + int64_t innerLaneData = laneData[1]; + if (laneData[0] == 1 && innerLaneData != 1) { + int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); + assert(elementTyBitwidth < 32 && + "Expected element type bitwidth < 32 with laneData[1] != 1"); + SmallVector newShape(tdescType.getShape()); + newShape.back() = newShape.back() / innerLaneData; + Type newElemTy = IntegerType::get(tdescType.getContext(), + elementTyBitwidth * innerLaneData); + xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get( + tdescType.getContext(), + tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1}); + return xegpu::TensorDescType::get( + newShape, newElemTy, tdescType.getArrayLength(), + tdescType.getBoundaryCheck(), tdescType.getMemorySpace(), newLayout); + } + return tdescType; +} + +static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc, + OpFoldResult ofr) { + std::optional mayBeInt = getConstantIntValue(ofr); + if (mayBeInt) + return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt); + return llvm::cast(ofr); +} + +static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc, + Value val, int64_t constant) { + auto constantOp = arith::ConstantIndexOp::create(rewriter, loc, constant); + return arith::DivUIOp::create(rewriter, loc, 
val, constantOp.getResult()) + .getResult(); +} + class XeGPUCreateNdDescOpPattern final : public OpConversionPattern { public: @@ -50,17 +96,106 @@ class XeGPUCreateNdDescOpPattern final matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto tdescTy = createNdOp.getType(); - auto convertType = this->getTypeConverter()->convertType(tdescTy); + auto convertType = getModifiedTensorDescType(tdescTy); if (convertType == tdescTy) return failure(); + auto strides = createNdOp.getMixedStrides(); + auto maybeConstInnerStride = getConstantIntValue(strides.back()); + // Only row-major memrefs are expected for now. + if (!maybeConstInnerStride || *maybeConstInnerStride != 1) + return failure(); + Value source = createNdOp.getSource(); + auto optionalLaneData = get2DLaneData(tdescTy); + assert(optionalLaneData && "Expected 2D lane data"); + auto laneData = optionalLaneData.value(); + int64_t innerLaneData = laneData[1]; + auto memrefType = dyn_cast(source.getType()); + // Inner dimension of the shape must be adjusted based on innerLaneData. + SmallVector modifiedShape(createNdOp.getMixedSizes()); + modifiedShape.back() = divideByConstant( + rewriter, createNdOp.getLoc(), + convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()), + innerLaneData); + // Similarly, second to last stride must be adjusted. + assert(strides.size() >= 2 && + "Expected at least 2 strides for CreateNdDescOp"); + SmallVector modifiedStrides(strides); + modifiedStrides[modifiedStrides.size() - 2] = divideByConstant( + rewriter, createNdOp.getLoc(), + convertToValue(rewriter, createNdOp.getLoc(), + modifiedStrides[modifiedStrides.size() - 2]), + innerLaneData); + + // If the source is a static memref, we need to extract the pointer to + // base address. + if (memrefType && memrefType.hasStaticShape()) { + auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create( + rewriter, createNdOp.getLoc(), source); + source = arith::IndexCastOp::create( + rewriter, createNdOp.getLoc(), + IntegerType::get(rewriter.getContext(), 64), + extractOp.getResult()) + .getResult(); + } + // Create a new CreateNdDescOp with the modified shape and converted type. + auto newCreateNdDescOp = xegpu::CreateNdDescOp::create( + rewriter, createNdOp.getLoc(), convertType, source, modifiedShape, + modifiedStrides); + rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult()); return success(); } }; } // namespace +class XeGPULoadNdDescOpPattern final + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto origTensorDescType = loadNdOp.getTensorDescType(); + auto adaptorType = + cast(adaptor.getTensorDesc().getType()); + if (adaptorType == origTensorDescType) + return failure(); + // Offsets must be adjusted based on innerLaneData. 
+ auto optionalLaneData = get2DLaneData(loadNdOp.getTensorDescType()); + assert(optionalLaneData && "Expected 2D lane data"); + int64_t innerLaneData = optionalLaneData.value()[1]; + auto offsets = loadNdOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(loadNdOp, + "Expecting offsets in LoadNd"); + SmallVector modifiedOffsets(offsets); + modifiedOffsets.back() = divideByConstant( + rewriter, loadNdOp.getLoc(), + convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()), + innerLaneData); + VectorType modifiedType = + VectorType::get(adaptorType.getShape(), adaptorType.getElementType()); + // Create a new LoadNdOp with modified offsets and type. + auto newLoadNdOp = xegpu::LoadNdOp::create( + rewriter, loadNdOp->getLoc(), modifiedType, adaptor.getTensorDesc(), + modifiedOffsets, loadNdOp.getPackedAttr(), loadNdOp.getTransposeAttr(), + loadNdOp.getL1HintAttr(), loadNdOp.getL2HintAttr(), + loadNdOp.getL3HintAttr()); + // Bitcast back to the original type. + auto castOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + loadNdOp.getType(), newLoadNdOp); + // Cast op must have the same layout as the original LoadNdOp result. + xegpu::setDistributeLayoutAttr( + castOp->getOpResult(0), + xegpu::getDistributeLayoutAttr(loadNdOp.getResult())); + rewriter.replaceOp(loadNdOp, castOp.getResult()); + return success(); + } +}; + void xegpu::populateXeGPUOptimizeTransposePatterns( RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); + patterns.add( + patterns.getContext()); } namespace { @@ -74,41 +209,28 @@ struct XeGPUOptimizeTransposePass final RewritePatternSet patterns(&context); ConversionTarget target(context); + auto checkValidInnerLaneData = + [](std::optional> optionalLaneData) -> bool { + if (!optionalLaneData) + return true; + auto laneData = optionalLaneData.value(); + return laneData[0] != 1 || laneData[1] == 1; + }; + target.addDynamicallyLegalOp( - [](xegpu::CreateNdDescOp createNdOp) { + [&](xegpu::CreateNdDescOp createNdOp) { auto optionalLaneData = get2DLaneData(createNdOp.getType()); - if (!optionalLaneData) - return true; - auto laneData = optionalLaneData.value(); - return laneData[0] != 1 || laneData[1] == 1; + return checkValidInnerLaneData(optionalLaneData); }); + target.addDynamicallyLegalOp( + [&](xegpu::LoadNdOp loadNdOp) { + auto optionalLaneData = get2DLaneData(loadNdOp.getTensorDescType()); + return checkValidInnerLaneData(optionalLaneData); + }); + converter.addConversion([](Type type) { return type; }); - converter.addConversion([](xegpu::TensorDescType tdescType) { - auto optionalLaneData = get2DLaneData(tdescType); - if (!optionalLaneData) - return tdescType; - auto laneData = optionalLaneData.value(); - int64_t innerLaneData = laneData[1]; - if (laneData[0] == 1 && innerLaneData != 1) { - int elementTyBitwidth = - tdescType.getElementType().getIntOrFloatBitWidth(); - assert(elementTyBitwidth < 32 && - "Expected element type bitwidth < 32 with laneData[1] != 1"); - SmallVector newShape(tdescType.getShape()); - newShape.back() = newShape.back() / innerLaneData; - Type newElemTy = IntegerType::get(tdescType.getContext(), - elementTyBitwidth * innerLaneData); - xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get( - tdescType.getContext(), - tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1}); - return xegpu::TensorDescType::get( - newShape, newElemTy, tdescType.getArrayLength(), - tdescType.getBoundaryCheck(), tdescType.getMemorySpace(), - newLayout); - } - return tdescType; - }); - + 
target.addLegalDialect(); scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, target); xegpu::populateXeGPUOptimizeTransposePatterns(patterns); diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir index 529211e286902..0bc74d75a4ad0 100644 --- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -1,18 +1,117 @@ + + +#a = #xegpu.layout +#b = #xegpu.layout +#bt = #xegpu.layout +func.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x16xf32> { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %1 = xegpu.load_nd %0[%c0, %c32] { result_layout = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #a } : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + return %6 : vector<8x16xf32> +} + +// ----- +#a = #xegpu.layout +#b = #xegpu.layout +#bt = #xegpu.layout +func.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8x16xi32> { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xi8> -> !xegpu.tensor_desc<16x32xi8, #b> + %1 = xegpu.load_nd %0[%c0, %c64] { result_layout = #b } : !xegpu.tensor_desc<16x32xi8, #b> -> vector<16x32xi8> + %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x32xi8> to vector<32x16xi8> + %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #a } : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> + return %6 : vector<8x16xi32> +} + + +// ----- +#a = #xegpu.layout +#b = #xegpu.layout +#bt = #xegpu.layout func.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c256 = arith.constant 256 : index - %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> - %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) { - %5 = xegpu.load_nd %2[%c0, %arg3] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> - %6 = xegpu.load_nd %3[%c0, %arg3] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %7 = vector.transpose %6, [1, 0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf16> - %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %5 
= xegpu.load_nd %2[%c0, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> scf.yield %8 : vector<8x16xf32> - } {layout_result_0 = #xegpu.layout} - xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + } {layout_result_0 = #a} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + return +} + +// ----- +#a = #xegpu.layout +#b = #xegpu.layout +#bt = #xegpu.layout +func.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<32x32xf16, #b> + %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) + -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> + %7 = vector.extract_strided_slice %6 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %8 = vector.extract_strided_slice %6 {offsets = [0, 16], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %9 = vector.extract_strided_slice %6 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %10 = vector.extract_strided_slice %6 {offsets = [16, 16], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %11 = vector.transpose %7, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32> + } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} + xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, 
!xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + return +} + +// ----- +#a = #xegpu.layout +#b = #xegpu.layout +#bt = #xegpu.layout +func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + scf.for %arg8 = %c0 to %c256 step %c16 { + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%arg8, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) { + %5 = xegpu.load_nd %2[%arg8, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%arg8, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %8 : vector<8x16xf32> + } {layout_result_0 = #a} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + } return } From f79d2a2b68fc337fe431196c08c7b3e1b52a0256 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 23 Oct 2025 17:46:55 +0000 Subject: [PATCH 04/18] add some tests --- .../Transforms/XeGPUOptimizeTranspose.cpp | 15 ++++++ .../Dialect/XeGPU/optimize-transpose.mlir | 50 +++++++++---------- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp index 9792da7d6ce79..e0a9e3c221400 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp @@ -35,6 +35,21 @@ namespace xegpu { using namespace mlir; +struct TransposableBlockRange { + int minWidth, maxWidth, minHeight, maxHeight; +}; + +// TODO: Use uArch to get supported block ranges. 
+static TransposableBlockRange getBlockRange(int bitWidth) { + switch (bitWidth) { + case 32: + return {/**min width**/ 1, /**max width**/ 8, /**min height**/ 1, + /**max height**/ 32}; + default: + llvm_unreachable("Add support for other element bitwidths"); + } +} + namespace { static std::optional> diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir index 0bc74d75a4ad0..35c9294c32f04 100644 --- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -51,6 +51,31 @@ func.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf1 return } +// ----- +#a = #xegpu.layout +#b = #xegpu.layout +#bt = #xegpu.layout +func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + scf.for %arg8 = %c0 to %c256 step %c16 { + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%arg8, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) { + %5 = xegpu.load_nd %2[%arg8, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%arg8, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %8 : vector<8x16xf32> + } {layout_result_0 = #a} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + } + return +} + // ----- #a = #xegpu.layout #b = #xegpu.layout @@ -90,28 +115,3 @@ func.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> return } - -// ----- -#a = #xegpu.layout -#b = #xegpu.layout -#bt = #xegpu.layout -func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { - %c0 = arith.constant 0 : index - %c16 = arith.constant 16 : index - %c256 = arith.constant 256 : index - scf.for %arg8 = %c0 to %c256 step %c16 { - %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> - %1 = xegpu.load_nd %0[%arg8, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> - %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> - %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> - %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) { - %5 = xegpu.load_nd %2[%arg8, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> - %6 = xegpu.load_nd %3[%arg8, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> - %7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> - %8 = xegpu.dpas %5, 
%7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - scf.yield %8 : vector<8x16xf32> - } {layout_result_0 = #a} - xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> - } - return -} From ca5d9024135f3e49c79e3dc503cd715bca4dc38d Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 24 Oct 2025 23:37:14 +0000 Subject: [PATCH 05/18] save work --- .../Transforms/XeGPUOptimizeTranspose.cpp | 265 +++++++++++++----- .../Dialect/XeGPU/optimize-transpose.mlir | 44 +++ 2 files changed, 246 insertions(+), 63 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp index e0a9e3c221400..a697c0433ed56 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp @@ -9,18 +9,23 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/Transforms/Patterns.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/IR/BuiltinAttributeInterfaces.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OpDefinition.h" #include "mlir/IR/Types.h" #include "mlir/IR/Value.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/SmallVector.h" +#include #include namespace mlir { @@ -35,12 +40,14 @@ namespace xegpu { using namespace mlir; -struct TransposableBlockRange { - int minWidth, maxWidth, minHeight, maxHeight; +namespace { + +struct Allowed2DShapeRange { + int64_t minWidth, maxWidth, minHeight, maxHeight; }; // TODO: Use uArch to get supported block ranges. -static TransposableBlockRange getBlockRange(int bitWidth) { +static Allowed2DShapeRange getTransposableBlockRange(int bitWidth) { switch (bitWidth) { case 32: return {/**min width**/ 1, /**max width**/ 8, /**min height**/ 1, @@ -50,10 +57,8 @@ static TransposableBlockRange getBlockRange(int bitWidth) { } } -namespace { - static std::optional> -get2DLaneData(xegpu::TensorDescType tdescType) { +getMaybeLaneData(xegpu::TensorDescType tdescType) { auto layout = tdescType.getLayoutAttr(); if (!layout) return std::nullopt; @@ -63,44 +68,131 @@ get2DLaneData(xegpu::TensorDescType tdescType) { return laneData; } +static std::optional> +getMaybeLaneLayout(xegpu::TensorDescType tdescType) { + auto layout = tdescType.getLayoutAttr(); + if (!layout) + return std::nullopt; + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + if (laneLayout.size() != 2) + return std::nullopt; + return laneLayout; +} + +// A transpose layout is invalid if lane layout is transposed (lane[0] != 1 && +// lane[1] == 1), but inner lane data is not equal to [1, 1]. +static bool hasInvalidTranposeLayout(xegpu::TensorDescType tdescType) { + // If the dtype is greater or equal to 32 bits, layout must be valid. 
+ int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); + if (elementTyBitwidth >= 32) + return false; + auto maybeLaneLayout = getMaybeLaneLayout(tdescType); + auto maybeLaneData = getMaybeLaneData(tdescType); + if (!maybeLaneData || !maybeLaneLayout) + return false; + auto laneData = maybeLaneData.value(); + auto laneLayout = maybeLaneLayout.value(); + if (laneLayout[0] == 1 || laneLayout[1] != 1) + return false; + if (laneData[0] != 1 || laneData[1] == 1) + return false; + return true; +} + static xegpu::TensorDescType -getModifiedTensorDescType(xegpu::TensorDescType tdescType) { - auto optionalLaneData = get2DLaneData(tdescType); - if (!optionalLaneData) +tryConvertToTransposable(xegpu::TensorDescType tdescType) { + if (!hasInvalidTranposeLayout(tdescType)) return tdescType; - auto laneData = optionalLaneData.value(); + auto laneData = getMaybeLaneData(tdescType).value(); int64_t innerLaneData = laneData[1]; - if (laneData[0] == 1 && innerLaneData != 1) { - int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); - assert(elementTyBitwidth < 32 && - "Expected element type bitwidth < 32 with laneData[1] != 1"); - SmallVector newShape(tdescType.getShape()); - newShape.back() = newShape.back() / innerLaneData; - Type newElemTy = IntegerType::get(tdescType.getContext(), - elementTyBitwidth * innerLaneData); - xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get( - tdescType.getContext(), - tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1}); - return xegpu::TensorDescType::get( - newShape, newElemTy, tdescType.getArrayLength(), - tdescType.getBoundaryCheck(), tdescType.getMemorySpace(), newLayout); - } - return tdescType; + int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); + // Required shape is total shape of the vector result that this tensor desc + // must eventually load after adjusting for the new bitwidth and array + // length. + SmallVector requiredShape(tdescType.getShape()); + requiredShape.back() = + requiredShape.back() * tdescType.getArrayLength() / innerLaneData; + int newBitWidth = elementTyBitwidth * innerLaneData; + Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth); + // Supported shape is the max transpose shape that can be supported by + // hardware that is less than or equal to required shape. + auto supportedHeight = std::min( + requiredShape[0], getTransposableBlockRange(newBitWidth).maxHeight); + auto supportedWidth = std::min( + requiredShape[1], getTransposableBlockRange(newBitWidth).maxWidth); + SmallVector supportedShape = {supportedHeight, supportedWidth}; + + // Required shape must be multiple of supported shape. Otherwise, we can not + // optimize it. + // TODO: Supported shape can be adjusted to handle non-multiple cases. + if (requiredShape[0] % supportedShape[0] != 0 || + requiredShape[1] % supportedShape[1] != 0) + return tdescType; + + xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get( + tdescType.getContext(), + tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1}); + // Array length can not be larger than 1 for transpose case. 
+ return xegpu::TensorDescType::get( + supportedShape, newElemTy, /**array length**/ 1, + tdescType.getBoundaryCheck(), tdescType.getMemorySpace(), newLayout); +} + +static Value createConstantIndex(ConversionPatternRewriter &rewriter, + Location loc, int64_t value) { + return arith::ConstantIndexOp::create(rewriter, loc, value).getResult(); } static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc, OpFoldResult ofr) { std::optional mayBeInt = getConstantIntValue(ofr); if (mayBeInt) - return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt); + return createConstantIndex(rewriter, loc, *mayBeInt); return llvm::cast(ofr); } static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc, Value val, int64_t constant) { - auto constantOp = arith::ConstantIndexOp::create(rewriter, loc, constant); - return arith::DivUIOp::create(rewriter, loc, val, constantOp.getResult()) - .getResult(); + auto constantOp = createConstantIndex(rewriter, loc, constant); + return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult(); +} + +static Value generateLoads(ConversionPatternRewriter &rewriter, + TypedValue data, + SmallVector &shapeRatio, + SmallVector offsets, + SmallVector &supportedShape, + TypedValue newTensorDesc, + xegpu::LoadNdOp origLoadOp) { + Location loc = data.getLoc(); + assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp"); + Value offsetX = convertToValue(rewriter, loc, offsets[offsets.size() - 2]); + Value offsetY = convertToValue(rewriter, loc, offsets[offsets.size() - 1]); + for (int64_t h = 0; h < shapeRatio[0]; ++h) { + for (int64_t w = 0; w < shapeRatio[1]; ++w) { + int64_t localOffsetX = h * supportedShape[0]; + int64_t localOffsetY = w * supportedShape[1]; + Value loadOffsetX = arith::AddIOp::create( + rewriter, loc, offsetX, + createConstantIndex(rewriter, loc, localOffsetX)); + Value loadOffsetY = arith::AddIOp::create( + rewriter, loc, offsetY, + createConstantIndex(rewriter, loc, localOffsetY)); + auto loadOp = xegpu::LoadNdOp::create( + rewriter, loc, + VectorType::get(supportedShape, data.getType().getElementType()), + newTensorDesc, ArrayRef{loadOffsetX, loadOffsetY}, + origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(), + origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(), + origLoadOp.getL3HintAttr()); + // Insert the loaded block into the right position in data. 
+ data = vector::InsertStridedSliceOp::create( + rewriter, loc, loadOp.getResult(), data, + ArrayRef{localOffsetX, localOffsetY}, + ArrayRef{1, 1}); + } + } + return data; } class XeGPUCreateNdDescOpPattern final @@ -111,7 +203,7 @@ class XeGPUCreateNdDescOpPattern final matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto tdescTy = createNdOp.getType(); - auto convertType = getModifiedTensorDescType(tdescTy); + auto convertType = tryConvertToTransposable(tdescTy); if (convertType == tdescTy) return failure(); auto strides = createNdOp.getMixedStrides(); @@ -120,7 +212,7 @@ class XeGPUCreateNdDescOpPattern final if (!maybeConstInnerStride || *maybeConstInnerStride != 1) return failure(); Value source = createNdOp.getSource(); - auto optionalLaneData = get2DLaneData(tdescTy); + auto optionalLaneData = getMaybeLaneData(tdescTy); assert(optionalLaneData && "Expected 2D lane data"); auto laneData = optionalLaneData.value(); int64_t innerLaneData = laneData[1]; @@ -160,7 +252,6 @@ class XeGPUCreateNdDescOpPattern final return success(); } }; -} // namespace class XeGPULoadNdDescOpPattern final : public OpConversionPattern { @@ -175,9 +266,8 @@ class XeGPULoadNdDescOpPattern final if (adaptorType == origTensorDescType) return failure(); // Offsets must be adjusted based on innerLaneData. - auto optionalLaneData = get2DLaneData(loadNdOp.getTensorDescType()); - assert(optionalLaneData && "Expected 2D lane data"); - int64_t innerLaneData = optionalLaneData.value()[1]; + auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value(); + int64_t innerLaneData = laneData[1]; auto offsets = loadNdOp.getMixedOffsets(); if (offsets.empty()) return rewriter.notifyMatchFailure(loadNdOp, @@ -187,25 +277,82 @@ class XeGPULoadNdDescOpPattern final rewriter, loadNdOp.getLoc(), convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()), innerLaneData); - VectorType modifiedType = - VectorType::get(adaptorType.getShape(), adaptorType.getElementType()); - // Create a new LoadNdOp with modified offsets and type. - auto newLoadNdOp = xegpu::LoadNdOp::create( - rewriter, loadNdOp->getLoc(), modifiedType, adaptor.getTensorDesc(), - modifiedOffsets, loadNdOp.getPackedAttr(), loadNdOp.getTransposeAttr(), - loadNdOp.getL1HintAttr(), loadNdOp.getL2HintAttr(), - loadNdOp.getL3HintAttr()); - // Bitcast back to the original type. + // Get the 2D data shape of this loadNdOp in its original type including + // array length. + SmallVector origDataShape(origTensorDescType.getShape()); + // origDataShape.back() *= origTensorDescType.getArrayLength(); + // Adjust the data shape based on innerLaneData. + origDataShape.back() /= innerLaneData; + // HW supported shape is the new tensor desc shape after conversion. + SmallVector hwSupportedShape(adaptorType.getShape()); + // Shape ratio is 2D and, it describes how many blocks need to be loaded in + // HW supported shape to cover the original shape. + auto ratio = computeShapeRatio(origDataShape, hwSupportedShape) + .value(); // ratio must be defined if we reach here. + // Create a zero-initialized vector to hold all loaded blocks. + // TypedAttr zeroAttr = rewriter.getZeroAttr(adaptorType.getElementType()); + VectorType origVectorType = + VectorType::get(origDataShape, adaptorType.getElementType()); + Value data; + // Orig data shape is 3D for the array length case. 
+ if (origTensorDescType.getArrayLength() > 1) { + SmallVector arrayLenDataShape(origDataShape); + arrayLenDataShape.insert(arrayLenDataShape.begin(), + origTensorDescType.getArrayLength()); + auto arrayLenVecType = + VectorType::get(arrayLenDataShape, adaptorType.getElementType()); + data = arith::ConstantOp::create(rewriter, loadNdOp->getLoc(), + arrayLenVecType, + rewriter.getZeroAttr(arrayLenVecType)); + for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) { + Value slice = arith::ConstantOp::create( + rewriter, loadNdOp->getLoc(), + VectorType::get(origDataShape, adaptorType.getElementType()), + rewriter.getZeroAttr(origVectorType)); + // Increse the Y offset for each array slice. + Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(), + modifiedOffsets.back()); + modifiedOffsets.back() = + arith::AddIOp::create(rewriter, loadNdOp->getLoc(), offsetY, + createConstantIndex(rewriter, + loadNdOp->getLoc(), + i * origDataShape[1])) + .getResult(); + slice = generateLoads( + rewriter, cast>(slice), ratio, + modifiedOffsets, hwSupportedShape, + cast>(adaptor.getTensorDesc()), + loadNdOp); + // Insert slice to data. + data = vector::InsertOp::create(rewriter, loadNdOp->getLoc(), slice, + data, ArrayRef{i}); + } + // Cast back to the original type and replace all uses. + data = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + loadNdOp.getType(), data); + rewriter.replaceOp(loadNdOp, data); + return success(); + } + data = arith::ConstantOp::create( + rewriter, loadNdOp->getLoc(), + VectorType::get(origDataShape, adaptorType.getElementType()), + rewriter.getZeroAttr(origVectorType)); + data = generateLoads( + rewriter, cast>(data), ratio, modifiedOffsets, + hwSupportedShape, + cast>(adaptor.getTensorDesc()), + loadNdOp); auto castOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), - loadNdOp.getType(), newLoadNdOp); - // Cast op must have the same layout as the original LoadNdOp result. - xegpu::setDistributeLayoutAttr( - castOp->getOpResult(0), - xegpu::getDistributeLayoutAttr(loadNdOp.getResult())); - rewriter.replaceOp(loadNdOp, castOp.getResult()); + loadNdOp.getType(), data); + // // Cast op must have the same layout as the original LoadNdOp result. + // xegpu::setDistributeLayoutAttr( + // castOp->getOpResult(0), + // xegpu::getDistributeLayoutAttr(loadNdOp.getResult())); + rewriter.replaceOp(loadNdOp, castOp); return success(); } }; +} // namespace void xegpu::populateXeGPUOptimizeTransposePatterns( RewritePatternSet &patterns) { @@ -224,23 +371,15 @@ struct XeGPUOptimizeTransposePass final RewritePatternSet patterns(&context); ConversionTarget target(context); - auto checkValidInnerLaneData = - [](std::optional> optionalLaneData) -> bool { - if (!optionalLaneData) - return true; - auto laneData = optionalLaneData.value(); - return laneData[0] != 1 || laneData[1] == 1; - }; - + // CreateNdDescOp and LoadNdOp with invalid transpose layout must be + // converted. 
target.addDynamicallyLegalOp( [&](xegpu::CreateNdDescOp createNdOp) { - auto optionalLaneData = get2DLaneData(createNdOp.getType()); - return checkValidInnerLaneData(optionalLaneData); + return !hasInvalidTranposeLayout(createNdOp.getType()); }); target.addDynamicallyLegalOp( [&](xegpu::LoadNdOp loadNdOp) { - auto optionalLaneData = get2DLaneData(loadNdOp.getTensorDescType()); - return checkValidInnerLaneData(optionalLaneData); + return !hasInvalidTranposeLayout(loadNdOp.getTensorDescType()); }); converter.addConversion([](Type type) { return type; }); diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir index 35c9294c32f04..a59ab2e14e36c 100644 --- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -115,3 +115,47 @@ func.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> return } + +// ----- +#a = #xegpu.layout +#b = #xegpu.layout +#bt = #xegpu.layout +func.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> + -> !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr> + %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) + -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } + : !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + %19 = vector.extract %6[0] : vector<32x16xf16> from vector<2x32x16xf16> + %20 = vector.extract %6[1] : vector<32x16xf16> from vector<2x32x16xf16> + %7 = vector.extract_strided_slice %19 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %8 = vector.extract_strided_slice %19 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %9 = vector.extract_strided_slice %20 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %10 = vector.extract_strided_slice %20 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %11 = vector.transpose %7, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : 
vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32> + } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} + xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + return +} From 35ca92befdf5830c2b7c25cfcf4b9456ec012a8b Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Sat, 25 Oct 2025 00:38:06 +0000 Subject: [PATCH 06/18] working version --- .../Transforms/XeGPUOptimizeTranspose.cpp | 69 ++++++++++++++----- 1 file changed, 50 insertions(+), 19 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp index a697c0433ed56..c73f2e4607482 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp @@ -296,18 +296,18 @@ class XeGPULoadNdDescOpPattern final Value data; // Orig data shape is 3D for the array length case. if (origTensorDescType.getArrayLength() > 1) { - SmallVector arrayLenDataShape(origDataShape); - arrayLenDataShape.insert(arrayLenDataShape.begin(), - origTensorDescType.getArrayLength()); - auto arrayLenVecType = - VectorType::get(arrayLenDataShape, adaptorType.getElementType()); - data = arith::ConstantOp::create(rewriter, loadNdOp->getLoc(), - arrayLenVecType, - rewriter.getZeroAttr(arrayLenVecType)); + // SmallVector arrayLenDataShape(origDataShape); + // arrayLenDataShape.insert(arrayLenDataShape.begin(), + // origTensorDescType.getArrayLength()); + // auto arrayLenVecType = + // VectorType::get(arrayLenDataShape, adaptorType.getElementType()); + // auto = arith::ConstantOp::create(rewriter, loadNdOp->getLoc(), + // arrayLenVecType, + // rewriter.getZeroAttr(arrayLenVecType)); + SmallVector arraySlices; for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) { Value slice = arith::ConstantOp::create( - rewriter, loadNdOp->getLoc(), - VectorType::get(origDataShape, adaptorType.getElementType()), + rewriter, loadNdOp->getLoc(), origVectorType, rewriter.getZeroAttr(origVectorType)); // Increse the Y offset for each array slice. Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(), @@ -323,14 +323,20 @@ class XeGPULoadNdDescOpPattern final modifiedOffsets, hwSupportedShape, cast>(adaptor.getTensorDesc()), loadNdOp); - // Insert slice to data. - data = vector::InsertOp::create(rewriter, loadNdOp->getLoc(), slice, - data, ArrayRef{i}); + // // Insert slice to data. + // data = vector::InsertOp::create(rewriter, loadNdOp->getLoc(), slice, + // data, ArrayRef{i}); + // Bitcast back to original load shape without array length. + auto bitcastType = VectorType::get(origTensorDescType.getShape(), + origTensorDescType.getElementType()); + slice = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + bitcastType, slice); + arraySlices.push_back(slice); } - // Cast back to the original type and replace all uses. 
- data = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), - loadNdOp.getType(), data); - rewriter.replaceOp(loadNdOp, data); + // // Cast back to the original type and replace all uses. + // data = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + // loadNdOp.getType(), data); + rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); return success(); } data = arith::ConstantOp::create( @@ -352,12 +358,33 @@ class XeGPULoadNdDescOpPattern final return success(); } }; + +class VectorExtractOpPattern final + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (adaptor.getSource().size() == 1) + return failure(); + auto mixedPos = extractOp.getMixedPosition(); + if (mixedPos.size() != 1) + return failure(); + auto mayBeInt = getConstantIntValue(mixedPos[0]); + if (!mayBeInt) + return failure(); + rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]); + return success(); + } +}; + } // namespace void xegpu::populateXeGPUOptimizeTransposePatterns( RewritePatternSet &patterns) { - patterns.add( - patterns.getContext()); + patterns.add(patterns.getContext()); } namespace { @@ -381,6 +408,10 @@ struct XeGPUOptimizeTransposePass final [&](xegpu::LoadNdOp loadNdOp) { return !hasInvalidTranposeLayout(loadNdOp.getTensorDescType()); }); + target.addDynamicallyLegalOp( + [&](vector::ExtractOp extractOp) { + return extractOp.getSourceVectorType().getRank() != 3; + }); converter.addConversion([](Type type) { return type; }); target.addLegalDialect Date: Tue, 28 Oct 2025 21:42:47 +0000 Subject: [PATCH 07/18] add tests --- .../Transforms/XeGPUOptimizeTranspose.cpp | 84 +++++++------ .../Dialect/XeGPU/optimize-transpose.mlir | 114 +++++++++++++++++- 2 files changed, 154 insertions(+), 44 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp index c73f2e4607482..462751e16ace1 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp @@ -79,9 +79,20 @@ getMaybeLaneLayout(xegpu::TensorDescType tdescType) { return laneLayout; } +static bool canBeOptimized(ArrayRef laneLayout, + ArrayRef laneData) { + if (laneLayout.size() != 2 || laneData.size() != 2) + return false; + if (laneLayout[0] == 1 || laneLayout[1] != 1) + return false; + if (laneData[0] != 1 || laneData[1] == 1) + return false; + return true; +} + // A transpose layout is invalid if lane layout is transposed (lane[0] != 1 && // lane[1] == 1), but inner lane data is not equal to [1, 1]. -static bool hasInvalidTranposeLayout(xegpu::TensorDescType tdescType) { +static bool canBeOptimized(xegpu::TensorDescType tdescType) { // If the dtype is greater or equal to 32 bits, layout must be valid. 
int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); if (elementTyBitwidth >= 32) @@ -90,18 +101,12 @@ static bool hasInvalidTranposeLayout(xegpu::TensorDescType tdescType) { auto maybeLaneData = getMaybeLaneData(tdescType); if (!maybeLaneData || !maybeLaneLayout) return false; - auto laneData = maybeLaneData.value(); - auto laneLayout = maybeLaneLayout.value(); - if (laneLayout[0] == 1 || laneLayout[1] != 1) - return false; - if (laneData[0] != 1 || laneData[1] == 1) - return false; - return true; + return canBeOptimized(*maybeLaneLayout, *maybeLaneData); } static xegpu::TensorDescType tryConvertToTransposable(xegpu::TensorDescType tdescType) { - if (!hasInvalidTranposeLayout(tdescType)) + if (!canBeOptimized(tdescType)) return tdescType; auto laneData = getMaybeLaneData(tdescType).value(); int64_t innerLaneData = laneData[1]; @@ -185,11 +190,17 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(), origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(), origLoadOp.getL3HintAttr()); + // Set the layout for the loadOp. + auto layoutAttr = newTensorDesc.getType().getLayoutAttr(); + xegpu::setDistributeLayoutAttr(loadOp->getOpResult(0), layoutAttr); // Insert the loaded block into the right position in data. - data = vector::InsertStridedSliceOp::create( + auto insertOp = vector::InsertStridedSliceOp::create( rewriter, loc, loadOp.getResult(), data, ArrayRef{localOffsetX, localOffsetY}, ArrayRef{1, 1}); + // InsertOp must have the same layout as newTensorDesc. + xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr); + data = insertOp.getResult(); } } return data; @@ -288,7 +299,7 @@ class XeGPULoadNdDescOpPattern final // Shape ratio is 2D and, it describes how many blocks need to be loaded in // HW supported shape to cover the original shape. auto ratio = computeShapeRatio(origDataShape, hwSupportedShape) - .value(); // ratio must be defined if we reach here. + .value(); // `ratio` must be defined if we reach here. // Create a zero-initialized vector to hold all loaded blocks. // TypedAttr zeroAttr = rewriter.getZeroAttr(adaptorType.getElementType()); VectorType origVectorType = @@ -296,20 +307,12 @@ class XeGPULoadNdDescOpPattern final Value data; // Orig data shape is 3D for the array length case. if (origTensorDescType.getArrayLength() > 1) { - // SmallVector arrayLenDataShape(origDataShape); - // arrayLenDataShape.insert(arrayLenDataShape.begin(), - // origTensorDescType.getArrayLength()); - // auto arrayLenVecType = - // VectorType::get(arrayLenDataShape, adaptorType.getElementType()); - // auto = arith::ConstantOp::create(rewriter, loadNdOp->getLoc(), - // arrayLenVecType, - // rewriter.getZeroAttr(arrayLenVecType)); SmallVector arraySlices; for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) { Value slice = arith::ConstantOp::create( rewriter, loadNdOp->getLoc(), origVectorType, rewriter.getZeroAttr(origVectorType)); - // Increse the Y offset for each array slice. + // Increase the Y offset for each array slice. Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(), modifiedOffsets.back()); modifiedOffsets.back() = @@ -323,19 +326,16 @@ class XeGPULoadNdDescOpPattern final modifiedOffsets, hwSupportedShape, cast>(adaptor.getTensorDesc()), loadNdOp); - // // Insert slice to data. - // data = vector::InsertOp::create(rewriter, loadNdOp->getLoc(), slice, - // data, ArrayRef{i}); - // Bitcast back to original load shape without array length. 
+ // BitCast back to original load shape without array length. auto bitcastType = VectorType::get(origTensorDescType.getShape(), origTensorDescType.getElementType()); - slice = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), - bitcastType, slice); - arraySlices.push_back(slice); + auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + bitcastType, slice); + // BitCastOp must have the same layout as the original loadNdOp. + xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); + arraySlices.push_back(bitCastOp.getResult()); } - // // Cast back to the original type and replace all uses. - // data = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), - // loadNdOp.getType(), data); rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); return success(); } @@ -348,13 +348,12 @@ class XeGPULoadNdDescOpPattern final hwSupportedShape, cast>(adaptor.getTensorDesc()), loadNdOp); - auto castOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), - loadNdOp.getType(), data); - // // Cast op must have the same layout as the original LoadNdOp result. - // xegpu::setDistributeLayoutAttr( - // castOp->getOpResult(0), - // xegpu::getDistributeLayoutAttr(loadNdOp.getResult())); - rewriter.replaceOp(loadNdOp, castOp); + auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + loadNdOp.getType(), data); + // BitCastOp must have the same layout as the original loadNdOp. + xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); + rewriter.replaceOp(loadNdOp, bitCastOp); return success(); } }; @@ -402,15 +401,20 @@ struct XeGPUOptimizeTransposePass final // converted. target.addDynamicallyLegalOp( [&](xegpu::CreateNdDescOp createNdOp) { - return !hasInvalidTranposeLayout(createNdOp.getType()); + return !canBeOptimized(createNdOp.getType()); }); target.addDynamicallyLegalOp( [&](xegpu::LoadNdOp loadNdOp) { - return !hasInvalidTranposeLayout(loadNdOp.getTensorDescType()); + return !canBeOptimized(loadNdOp.getTensorDescType()); }); target.addDynamicallyLegalOp( [&](vector::ExtractOp extractOp) { - return extractOp.getSourceVectorType().getRank() != 3; + auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult()); + if (!layout) + return true; + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + auto laneData = layout.getEffectiveLaneDataAsInt(); + return !canBeOptimized(laneLayout, laneData); }); converter.addConversion([](Type type) { return type; }); diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir index a59ab2e14e36c..7b3bd1bf8e4fe 100644 --- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -1,5 +1,18 @@ +// RUN: mlir-opt -xegpu-optimize-transpose -canonicalize -split-input-file %s | FileCheck %s - +// CHECK-LABEL: func.func @no_scf( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xf16>, %{{.*}}: vector<8x16xf16>) -> vector<8x16xf32> { +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C32:.*]] = arith.constant 32 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xf16> -> index +// CHECK: %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C32]]], strides : [%[[C32]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> +// CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}, 
%[[C16]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[B]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to vector<16x16xf16> #a = #xegpu.layout #b = #xegpu.layout #bt = #xegpu.layout @@ -14,21 +27,51 @@ func.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8 } // ----- +// CHECK-LABEL: func.func @no_scf_i8( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xi8>, %{{.*}}: vector<8x32xi8>) -> vector<8x16xi32> { +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xi8> -> index +// CHECK: %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C16]]], strides : [%[[C16]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T1]][%{{.*}}, %[[C16]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> +// CHECK: %[[T3:.*]] = vector.bitcast %[[T2]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to vector<16x32xi8> #a = #xegpu.layout #b = #xegpu.layout #bt = #xegpu.layout +#c = #xegpu.layout func.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8x16xi32> { %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xi8> -> !xegpu.tensor_desc<16x32xi8, #b> %1 = xegpu.load_nd %0[%c0, %c64] { result_layout = #b } : !xegpu.tensor_desc<16x32xi8, #b> -> vector<16x32xi8> %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x32xi8> to vector<32x16xi8> - %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #a } : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> + %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #c } : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> return %6 : vector<8x16xi32> } // ----- +// CHECK-LABEL: func.func @gemm_b_transpose( +// CHECK-SAME: %{{.*}} memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C256:.*]] = arith.constant 256 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%c128, 1] +// CHECK-SAME: : i64 -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> +// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { +// CHECK: %[[T7:.*]] = arith.divui %[[K]], %[[C2]] : index +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> #a = #xegpu.layout #b = #xegpu.layout #bt = #xegpu.layout @@ -52,6 +95,24 @@ func.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf1 } // ----- +// CHECK-LABEL: func.func @nested_scf( 
+// CHECK-SAME: %{{.*}}: memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C256:.*]] = arith.constant 256 : index +// CHECK: scf.for %{{.*}} to %{{.*}} step %{{.*}} { +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> +// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { +// CHECK: %[[T7:.*]] = arith.divui %[[K]], %[[C2]] : index +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] {layout_result_0 = #xegpu.layout< +// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> #a = #xegpu.layout #b = #xegpu.layout #bt = #xegpu.layout @@ -77,6 +138,31 @@ func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %a } // ----- +// CHECK-LABEL: func.func @large_loads( +// CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x16xi32> +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<32x8xi32, #xegpu.layout> +// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: %[[T5:.*]] = arith.divui %[[K]], %[[C2]] : index +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> +// CHECK: %[[T7:.*]] = vector.insert_strided_slice %[[T6]], %[[CST]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout, offsets = [0, 0], strides = [1, 1]} +// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> +// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> +// CHECK: %[[T10:.*]] = vector.insert_strided_slice %[[T9]], %[[T7]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout, offsets = [0, 8], strides = [1, 1]} +// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> +// CHECK: %{{.*}} = vector.bitcast %[[T10]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : vector<32x16xi32> to vector<32x32xf16> #a = #xegpu.layout #b = #xegpu.layout #bt = #xegpu.layout @@ -117,6 +203,26 @@ func.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg } // ----- +// CHECK-LABEL: func.func 
@array_length( +// CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %arg2: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 -> +// CHECK-SAME: !xegpu.tensor_desc<32x8xi32, #xegpu.layout> +// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: %[[T5:.*]] = arith.divui %[[K]], %[[C2]] : index +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> +// CHECK: %[[T7:.*]] = vector.bitcast %[[T6]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> +// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> +// CHECK: %[[T10:.*]] = vector.bitcast %[[T9]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> #a = #xegpu.layout #b = #xegpu.layout #bt = #xegpu.layout @@ -133,8 +239,8 @@ func.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %ar -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> - %19 = vector.extract %6[0] : vector<32x16xf16> from vector<2x32x16xf16> - %20 = vector.extract %6[1] : vector<32x16xf16> from vector<2x32x16xf16> + %19 = vector.extract %6[0] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> + %20 = vector.extract %6[1] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> %7 = vector.extract_strided_slice %19 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } : vector<32x16xf16> to vector<16x16xf16> %8 = vector.extract_strided_slice %19 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } From cbcccf632b3d086f78e93e243d3847c3cbf3b62e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 29 Oct 2025 00:12:05 +0000 Subject: [PATCH 08/18] add comments --- .../Transforms/XeGPUOptimizeTranspose.cpp | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp index 462751e16ace1..385a0dd470f03 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp @@ -46,7 +46,8 @@ struct Allowed2DShapeRange { int64_t minWidth, maxWidth, minHeight, maxHeight; }; -// TODO: Use uArch to get supported block ranges. +/// Helper to get the size range of a 2D block that can be transposed by HW. +/// TODO: Use uArch to get supported block ranges. 
static Allowed2DShapeRange getTransposableBlockRange(int bitWidth) { switch (bitWidth) { case 32: @@ -57,6 +58,7 @@ static Allowed2DShapeRange getTransposableBlockRange(int bitWidth) { } } +/// Get the 2D lane data from a tensor desc type if it exists. static std::optional> getMaybeLaneData(xegpu::TensorDescType tdescType) { auto layout = tdescType.getLayoutAttr(); @@ -68,6 +70,7 @@ getMaybeLaneData(xegpu::TensorDescType tdescType) { return laneData; } +/// Get the 2D lane layout from a tensor desc type if it exists. static std::optional> getMaybeLaneLayout(xegpu::TensorDescType tdescType) { auto layout = tdescType.getLayoutAttr(); @@ -79,6 +82,8 @@ getMaybeLaneLayout(xegpu::TensorDescType tdescType) { return laneLayout; } +/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 && +/// lane[1] == 1), but inner lane data is not equal to [1, 1]. static bool canBeOptimized(ArrayRef laneLayout, ArrayRef laneData) { if (laneLayout.size() != 2 || laneData.size() != 2) @@ -90,8 +95,8 @@ static bool canBeOptimized(ArrayRef laneLayout, return true; } -// A transpose layout is invalid if lane layout is transposed (lane[0] != 1 && -// lane[1] == 1), but inner lane data is not equal to [1, 1]. +/// A tensor desc type can be optimized if its element type is less than 32 bits +/// and its layout can be optimized. static bool canBeOptimized(xegpu::TensorDescType tdescType) { // If the dtype is greater or equal to 32 bits, layout must be valid. int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); @@ -104,8 +109,9 @@ static bool canBeOptimized(xegpu::TensorDescType tdescType) { return canBeOptimized(*maybeLaneLayout, *maybeLaneData); } -static xegpu::TensorDescType -tryConvertToTransposable(xegpu::TensorDescType tdescType) { +/// Check if a tensor desc type can be optimized for transpose, if so return the +/// new optimized tensor desc type with a valid transpose layout. +static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType) { if (!canBeOptimized(tdescType)) return tdescType; auto laneData = getMaybeLaneData(tdescType).value(); @@ -143,11 +149,13 @@ tryConvertToTransposable(xegpu::TensorDescType tdescType) { tdescType.getBoundaryCheck(), tdescType.getMemorySpace(), newLayout); } +/// Helper to create a constant index value. static Value createConstantIndex(ConversionPatternRewriter &rewriter, Location loc, int64_t value) { return arith::ConstantIndexOp::create(rewriter, loc, value).getResult(); } +/// Helper to convert an OpFoldResult to Value. static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc, OpFoldResult ofr) { std::optional mayBeInt = getConstantIntValue(ofr); @@ -156,6 +164,7 @@ static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc, return llvm::cast(ofr); } +/// Helper to divide a Value by a constant integer. 
static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc, Value val, int64_t constant) { auto constantOp = createConstantIndex(rewriter, loc, constant); @@ -164,7 +173,6 @@ static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc, static Value generateLoads(ConversionPatternRewriter &rewriter, TypedValue data, - SmallVector &shapeRatio, SmallVector offsets, SmallVector &supportedShape, TypedValue newTensorDesc, @@ -173,6 +181,11 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp"); Value offsetX = convertToValue(rewriter, loc, offsets[offsets.size() - 2]); Value offsetY = convertToValue(rewriter, loc, offsets[offsets.size() - 1]); + // Compute the ratio between original shape and supported shape. We need to + // generate loads in this ratio arrangement. + auto shapeRatio = computeShapeRatio(data.getType().getShape(), + supportedShape) + .value(); // `ratio` must be defined if we reach here. for (int64_t h = 0; h < shapeRatio[0]; ++h) { for (int64_t w = 0; w < shapeRatio[1]; ++w) { int64_t localOffsetX = h * supportedShape[0]; @@ -214,7 +227,7 @@ class XeGPUCreateNdDescOpPattern final matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto tdescTy = createNdOp.getType(); - auto convertType = tryConvertToTransposable(tdescTy); + auto convertType = tryOptimize(tdescTy); if (convertType == tdescTy) return failure(); auto strides = createNdOp.getMixedStrides(); @@ -291,17 +304,10 @@ class XeGPULoadNdDescOpPattern final // Get the 2D data shape of this loadNdOp in its original type including // array length. SmallVector origDataShape(origTensorDescType.getShape()); - // origDataShape.back() *= origTensorDescType.getArrayLength(); // Adjust the data shape based on innerLaneData. origDataShape.back() /= innerLaneData; // HW supported shape is the new tensor desc shape after conversion. SmallVector hwSupportedShape(adaptorType.getShape()); - // Shape ratio is 2D and, it describes how many blocks need to be loaded in - // HW supported shape to cover the original shape. - auto ratio = computeShapeRatio(origDataShape, hwSupportedShape) - .value(); // `ratio` must be defined if we reach here. - // Create a zero-initialized vector to hold all loaded blocks. - // TypedAttr zeroAttr = rewriter.getZeroAttr(adaptorType.getElementType()); VectorType origVectorType = VectorType::get(origDataShape, adaptorType.getElementType()); Value data; @@ -322,8 +328,8 @@ class XeGPULoadNdDescOpPattern final i * origDataShape[1])) .getResult(); slice = generateLoads( - rewriter, cast>(slice), ratio, - modifiedOffsets, hwSupportedShape, + rewriter, cast>(slice), modifiedOffsets, + hwSupportedShape, cast>(adaptor.getTensorDesc()), loadNdOp); // BitCast back to original load shape without array length. 
@@ -344,7 +350,7 @@ class XeGPULoadNdDescOpPattern final VectorType::get(origDataShape, adaptorType.getElementType()), rewriter.getZeroAttr(origVectorType)); data = generateLoads( - rewriter, cast>(data), ratio, modifiedOffsets, + rewriter, cast>(data), modifiedOffsets, hwSupportedShape, cast>(adaptor.getTensorDesc()), loadNdOp); From b55f6b072981646f2853792c341855d583867cf2 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 29 Oct 2025 00:38:18 +0000 Subject: [PATCH 09/18] add comments --- .../Transforms/XeGPUOptimizeTranspose.cpp | 33 ++++++++++++++++--- .../Dialect/XeGPU/optimize-transpose.mlir | 16 ++++----- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp index 385a0dd470f03..bae2edfdfb890 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp @@ -167,20 +167,31 @@ static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc, /// Helper to divide a Value by a constant integer. static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc, Value val, int64_t constant) { + // If the constant is a power of 2, use right shift for division. + if (llvm::isPowerOf2_64(constant)) { + int64_t shiftAmount = llvm::Log2_64(constant); + return arith::ShRUIOp::create( + rewriter, loc, val, + createConstantIndex(rewriter, loc, shiftAmount)) + .getResult(); + } auto constantOp = createConstantIndex(rewriter, loc, constant); return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult(); } +/// This function takes a larger register block `data` and generates multiple +/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block +/// starting from `offsets`. static Value generateLoads(ConversionPatternRewriter &rewriter, TypedValue data, SmallVector offsets, - SmallVector &supportedShape, TypedValue newTensorDesc, xegpu::LoadNdOp origLoadOp) { Location loc = data.getLoc(); assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp"); Value offsetX = convertToValue(rewriter, loc, offsets[offsets.size() - 2]); Value offsetY = convertToValue(rewriter, loc, offsets[offsets.size() - 1]); + SmallVector supportedShape(newTensorDesc.getType().getShape()); // Compute the ratio between original shape and supported shape. We need to // generate loads in this ratio arrangement. auto shapeRatio = computeShapeRatio(data.getType().getShape(), @@ -219,6 +230,10 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, return data; } +/// Checks is a CreateNdDescOp can be optimized for transpose, if so creates a +/// new CreateNdDescOp with optimized tensor desc type. This involves extracting +/// the base pointer from the original memory source and adjusting the shape and +/// strides of the tensor desc to fit with the new optimized transpose layout. class XeGPUCreateNdDescOpPattern final : public OpConversionPattern { public: @@ -277,6 +292,10 @@ class XeGPUCreateNdDescOpPattern final } }; +/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for +/// tranpose optimization. If so, rewrites the LoadNdOp to to align with the +/// adjusted tensor desc type. This can result in multiple LoadNdOps being +/// generated to fill in the original load shape. 
class XeGPULoadNdDescOpPattern final : public OpConversionPattern { public: @@ -329,7 +348,6 @@ class XeGPULoadNdDescOpPattern final .getResult(); slice = generateLoads( rewriter, cast>(slice), modifiedOffsets, - hwSupportedShape, cast>(adaptor.getTensorDesc()), loadNdOp); // BitCast back to original load shape without array length. @@ -351,7 +369,6 @@ class XeGPULoadNdDescOpPattern final rewriter.getZeroAttr(origVectorType)); data = generateLoads( rewriter, cast>(data), modifiedOffsets, - hwSupportedShape, cast>(adaptor.getTensorDesc()), loadNdOp); auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), @@ -364,6 +381,10 @@ class XeGPULoadNdDescOpPattern final } }; +/// Vector ExtractOp must be processed if the original tensor desc type has +/// array length greater than 1. In this case, the LoadNdOp is replaced with +/// multiple LoadNdOps for each array slice making the extraction unnecessary. +/// In this case, we simply remove the ExtractOp. class VectorExtractOpPattern final : public OpConversionPattern { public: @@ -371,6 +392,7 @@ class VectorExtractOpPattern final LogicalResult matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { + // Check if the source of the extraction is split to multiple values. if (adaptor.getSource().size() == 1) return failure(); auto mixedPos = extractOp.getMixedPosition(); @@ -403,7 +425,7 @@ struct XeGPUOptimizeTransposePass final RewritePatternSet patterns(&context); ConversionTarget target(context); - // CreateNdDescOp and LoadNdOp with invalid transpose layout must be + // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be // converted. target.addDynamicallyLegalOp( [&](xegpu::CreateNdDescOp createNdOp) { @@ -413,6 +435,9 @@ struct XeGPUOptimizeTransposePass final [&](xegpu::LoadNdOp loadNdOp) { return !canBeOptimized(loadNdOp.getTensorDescType()); }); + // Vector ExtractOps can have optimizable layouts if they extract from + // LoadNdOps with array length greater than 1. These ExtractOps must be + // converted. 
target.addDynamicallyLegalOp( [&](vector::ExtractOp extractOp) { auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult()); diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir index 7b3bd1bf8e4fe..c1dc2a32c2b0c 100644 --- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -58,7 +58,7 @@ func.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector< // CHECK-LABEL: func.func @gemm_b_transpose( // CHECK-SAME: %{{.*}} memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { // CHECK: %[[C128:.*]] = arith.constant 128 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[C16:.*]] = arith.constant 16 : index // CHECK: %[[C256:.*]] = arith.constant 256 : index // CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index @@ -66,7 +66,7 @@ func.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector< // CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%c128, 1] // CHECK-SAME: : i64 -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> // CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { -// CHECK: %[[T7:.*]] = arith.divui %[[K]], %[[C2]] : index +// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index // CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] // CHECK-SAME: {layout_result_0 = #xegpu.layout} : // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> @@ -98,7 +98,7 @@ func.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf1 // CHECK-LABEL: func.func @nested_scf( // CHECK-SAME: %{{.*}}: memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { // CHECK: %[[C128:.*]] = arith.constant 128 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[C16:.*]] = arith.constant 16 : index // CHECK: %[[C256:.*]] = arith.constant 256 : index // CHECK: scf.for %{{.*}} to %{{.*}} step %{{.*}} { @@ -107,7 +107,7 @@ func.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf1 // CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 // CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> // CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { -// CHECK: %[[T7:.*]] = arith.divui %[[K]], %[[C2]] : index +// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index // CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] {layout_result_0 = #xegpu.layout< // CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>} : // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> @@ -143,13 +143,13 @@ func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %a // CHECK: %[[C128:.*]] = arith.constant 128 : index // CHECK: %[[C8:.*]] = arith.constant 8 : index // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x16xi32> -// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index // CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 // 
CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 // CHECK-SAME: -> !xegpu.tensor_desc<32x8xi32, #xegpu.layout> // CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { -// CHECK: %[[T5:.*]] = arith.divui %[[K]], %[[C2]] : index +// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index // CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> // CHECK: %[[T7:.*]] = vector.insert_strided_slice %[[T6]], %[[CST]] @@ -207,13 +207,13 @@ func.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg // CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %arg2: memref<256x256xf32>) { // CHECK: %[[C128:.*]] = arith.constant 128 : index // CHECK: %[[C8:.*]] = arith.constant 8 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index // CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 // CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 -> // CHECK-SAME: !xegpu.tensor_desc<32x8xi32, #xegpu.layout> // CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { -// CHECK: %[[T5:.*]] = arith.divui %[[K]], %[[C2]] : index +// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index // CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout> -> vector<32x8xi32> // CHECK: %[[T7:.*]] = vector.bitcast %[[T6]] {layout_result_0 = #xegpu.layout} From 51e84aba095aafd70bf8ba095ce1c7a088ba1961 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 3 Nov 2025 19:03:53 +0000 Subject: [PATCH 10/18] use uArch --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 9 +++ .../Transforms/XeGPUOptimizeTranspose.cpp | 68 ++++++++++++++----- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 36 ++-------- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 26 +++++++ 4 files changed, 94 insertions(+), 45 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 04cfd58d846a7..f23174cb0f697 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -162,6 +162,15 @@ SmallVector addElementwise(OpBuilder &builder, Location loc, SmallVector addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef lhs, ArrayRef rhs); + +/// Helper Function to find a proper instruction multiple for the user-supplied +/// sg-level data shape (diven by `dim`). `candidates` are uArch allowed shapes. +/// `candidateMultiples` are uArch multiples of such shapes (i.e. block count or +/// array length). 
+template +int getLargestDivisor(T dim, ArrayRef candidates, + ArrayRef candidateMultiples = {}); + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp index bae2edfdfb890..6e09f4e1b2830 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/Utils/IndexingUtils.h" @@ -16,6 +17,8 @@ #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" +#include "mlir/Dialect/XeGPU/uArch/uArchBase.h" #include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OpDefinition.h" @@ -24,6 +27,7 @@ #include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include #include @@ -111,7 +115,8 @@ static bool canBeOptimized(xegpu::TensorDescType tdescType) { /// Check if a tensor desc type can be optimized for transpose, if so return the /// new optimized tensor desc type with a valid transpose layout. -static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType) { +static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType, + const uArch *targetuArch) { if (!canBeOptimized(tdescType)) return tdescType; auto laneData = getMaybeLaneData(tdescType).value(); @@ -127,26 +132,34 @@ static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType) { Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth); // Supported shape is the max transpose shape that can be supported by // hardware that is less than or equal to required shape. - auto supportedHeight = std::min( - requiredShape[0], getTransposableBlockRange(newBitWidth).maxHeight); - auto supportedWidth = std::min( - requiredShape[1], getTransposableBlockRange(newBitWidth).maxWidth); - SmallVector supportedShape = {supportedHeight, supportedWidth}; - - // Required shape must be multiple of supported shape. Otherwise, we can not - // optimize it. - // TODO: Supported shape can be adjusted to handle non-multiple cases. - if (requiredShape[0] % supportedShape[0] != 0 || - requiredShape[1] % supportedShape[1] != 0) + auto *blockLoadTarget = dyn_cast( + targetuArch->getInstruction(InstructionKind::Subgroup2DBlockLoad)); + auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount( + newElemTy, /** has transform */ false, /** has transpose */ true); + // If no HW params found, return the original type. + if (!maybeHWParams) + return tdescType; + auto [widths, heights, counts] = maybeHWParams.value(); + // TODO: Currently we expect array length to be 1 for transpose case. 
+ if (counts.size() != 1 || counts[0] != 1) + return tdescType; + int arrayLen = counts[0]; + int supportedHeight = + xegpu::getLargestDivisor(static_cast(requiredShape[0]), heights); + int supportedWidth = + xegpu::getLargestDivisor(static_cast(requiredShape[1]), widths); + // If no supported height or width found, return the original type. + if (supportedHeight == -1 || supportedWidth == -1) return tdescType; + SmallVector supportedShape = {supportedHeight, supportedWidth}; xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get( tdescType.getContext(), tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1}); // Array length can not be larger than 1 for transpose case. - return xegpu::TensorDescType::get( - supportedShape, newElemTy, /**array length**/ 1, - tdescType.getBoundaryCheck(), tdescType.getMemorySpace(), newLayout); + return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen, + tdescType.getBoundaryCheck(), + tdescType.getMemorySpace(), newLayout); } /// Helper to create a constant index value. @@ -242,7 +255,15 @@ class XeGPUCreateNdDescOpPattern final matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto tdescTy = createNdOp.getType(); - auto convertType = tryOptimize(tdescTy); + // Get the target uArch info. + auto chipStr = xegpu::getChipStr(createNdOp); + // Check if the chip is supported. + assert( + chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") && + "Expecting target chip to be pvc or bmg for transpose optimization."); + const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value()); + + auto convertType = tryOptimize(tdescTy, targetuArch); if (convertType == tdescTy) return failure(); auto strides = createNdOp.getMixedStrides(); @@ -425,6 +446,21 @@ struct XeGPUOptimizeTransposePass final RewritePatternSet patterns(&context); ConversionTarget target(context); + // This pass is only meant for PVC and BMG targets. If unsupported target + // is found, exit early. + bool isTargetSupported = true; + getOperation()->walk([&](gpu::GPUFuncOp funcOp) { + auto chipStr = xegpu::getChipStr(funcOp); + if (chipStr && chipStr.value() != "pvc" && chipStr.value() != "bmg") + isTargetSupported = false; + }); + + if (!isTargetSupported) { + DBGS() << "XeGPUOptimizeTransposePass only supports PVC and BMG targets." + << "\n"; + return; + } + // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be // converted. target.addDynamicallyLegalOp( diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 90eae871a5ef3..59ea7aa978eab 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -204,28 +204,6 @@ struct LayoutInfoLattice : public Lattice { using Lattice::Lattice; }; -/// Helper Function to find a proper instruction multiple for the user-supplied -/// sg-level data shape. `candidates` are uArch allowed shapes. -/// `candidateMultiples` are uArch multiples of such shapes (e.g., block count). 
-template -int getLargestDivisor(T dim, ArrayRef candidates, - ArrayRef candidateMultiples = {}) { - static_assert(std::is_integral::value, "T must be an integer type"); - int largest = -1; - SmallVector multiples = {1}; - if (!candidateMultiples.empty()) - multiples = - SmallVector(candidateMultiples.begin(), candidateMultiples.end()); - for (T candidate : candidates) { - for (T multiple : multiples) { - int value = static_cast(candidate * multiple); - if (value != 0 && dim % value == 0 && value > largest) - largest = value; - } - } - return largest; -} - /// Helper Functions to get default layouts. A `default layout` is a layout that /// is assigned to a value when the layout is not fixed by some anchor operation /// (like DPAS). @@ -505,7 +483,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( prefetch.emitWarning("No known block params found for the element type."); auto [bWidth, bHeight, bCount] = blockWHC.value(); SmallVector instData; - int instWidth = getLargestDivisor( + int instWidth = xegpu::getLargestDivisor( static_cast(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, bCount); if (instWidth == -1) @@ -514,7 +492,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( if (tdescTy.getRank() == 1) instData = {instWidth}; else { - int instHeight = getLargestDivisor( + int instHeight = xegpu::getLargestDivisor( static_cast(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); if (instHeight == -1) prefetch.emitWarning( @@ -634,7 +612,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataALen = aTy.getShape().front(); auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); const int maxALen = - getLargestDivisor(dataALen, ArrayRef(supportedALen)); + xegpu::getLargestDivisor(dataALen, ArrayRef(supportedALen)); if (maxALen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -642,7 +620,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataBLen = bTy.getShape().back(); auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType()); const int maxBLen = - getLargestDivisor(dataBLen, ArrayRef(supportedBLen)); + xegpu::getLargestDivisor(dataBLen, ArrayRef(supportedBLen)); if (maxBLen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -662,7 +640,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataCLen = bTy.getShape().back(); auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType()); const int maxCLen = - getLargestDivisor(dataCLen, ArrayRef(supportedCLen)); + xegpu::getLargestDivisor(dataCLen, ArrayRef(supportedCLen)); if (maxCLen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -691,7 +669,7 @@ void LayoutInfoPropagation::visitStoreNdOp( store.emitWarning("No known block params found for the element type."); auto [bWidth, bHeight, bCount] = blockWHC.value(); SmallVector instData; - int instWidth = getLargestDivisor( + int instWidth = xegpu::getLargestDivisor( static_cast(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, bCount); if (instWidth == -1) @@ -700,7 +678,7 @@ void LayoutInfoPropagation::visitStoreNdOp( if (dataTy.getRank() == 1) instData = {instWidth}; else { - int instHeight = getLargestDivisor( + int instHeight = xegpu::getLargestDivisor( static_cast(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); if (instHeight == -1) store.emitWarning( diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 
a38993e0c55b1..8fa114977c2fa 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -500,3 +500,29 @@ xegpu::addWithRightAligned(OpBuilder &builder, Location loc, results.append(addElementwise(builder, loc, a, b)); return results; } + +template +int xegpu::getLargestDivisor(T dim, ArrayRef candidates, + ArrayRef candidateMultiples) { + static_assert(std::is_integral::value, "T must be an integer type"); + int largest = -1; + SmallVector multiples = {1}; + if (!candidateMultiples.empty()) + multiples = + SmallVector(candidateMultiples.begin(), candidateMultiples.end()); + for (T candidate : candidates) { + for (T multiple : multiples) { + int value = static_cast(candidate * multiple); + if (value != 0 && dim % value == 0 && value > largest) + largest = value; + } + } + return largest; +} + +/// Explicit instantiations +template int xegpu::getLargestDivisor(int dim, ArrayRef candidates, + ArrayRef candidateMultiples); +template int +xegpu::getLargestDivisor(unsigned dim, ArrayRef candidates, + ArrayRef candidateMultiples); From 9b694ad174e47b8f3c0f548a8b42657b6f3a8bf8 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 3 Nov 2025 21:15:44 +0000 Subject: [PATCH 11/18] change pass name --- .../mlir/Dialect/XeGPU/Transforms/Passes.td | 10 ++-- .../Dialect/XeGPU/Transforms/Transforms.h | 4 +- .../Dialect/XeGPU/Transforms/CMakeLists.txt | 2 +- ...nspose.cpp => XeGPUOptimizeBlockLoads.cpp} | 42 +++++----------- .../Dialect/XeGPU/optimize-transpose.mlir | 48 ++++++++++++------- 5 files changed, 52 insertions(+), 54 deletions(-) rename mlir/lib/Dialect/XeGPU/Transforms/{XeGPUOptimizeTranspose.cpp => XeGPUOptimizeBlockLoads.cpp} (94%) diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index fb3cb9f471608..23234891dd348 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -85,11 +85,13 @@ def XeGPUVectorLinearize : Pass<"xegpu-vector-linearize"> { "scf::SCFDialect", "ub::UBDialect", "vector::VectorDialect"]; } -def XeGPUOptimizeTranspose : Pass<"xegpu-optimize-transpose"> { - let summary = "Optimize XeGPU loadNd operations feeding into vector.transpose"; +def XeGPUOptimizeBlockLoads : Pass<"xegpu-optimize-block-loads"> { + let summary = "Optimize XeGPU block load operations"; let description = [{ - This pass rewrites XeGPU loadNd operations that feed into vector.transpose - into more optimal forms to improve performance. + This pass rewrites XeGPU loadNd operations into more optimal forms + to improve performance. This includes, + - Rewriting transpose B loads into more optimal forms to use HW block + transpose instructions for better performance. }]; let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"]; diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h index cab1598457cbe..1776a209d0bf1 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h @@ -61,8 +61,8 @@ struct UnrollOptions { /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`. void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns); -/// Appends patterns for optimizing transpose operations into `patterns`. 
-void populateXeGPUOptimizeTransposePatterns(RewritePatternSet &patterns); +/// Appends patterns for optimizing block load operations into `patterns`. +void populateXeGPUOptimizeBlockLoadsPatterns(RewritePatternSet &patterns); /// Appends patterns for XeGPU SIMT distribution into `patterns`. void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns); /// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op. diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt index a2ecbc3374ba9..29b645feab2c6 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt @@ -6,7 +6,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms XeGPUWgToSgDistribute.cpp XeGPUPropagateLayout.cpp XeGPUVectorLinearize.cpp - XeGPUOptimizeTranspose.cpp + XeGPUOptimizeBlockLoads.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp similarity index 94% rename from mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp rename to mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index 6e09f4e1b2830..9a490c6c3cc3f 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeTranspose.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -1,4 +1,4 @@ -//===- XeGPUOptimizeTranspose.cpp - XeGPU optimize transpose ----*- C++ -*-===// +//===- XeGPUOptimizeBlockLoads.cpp - XeGPU optimize block loads -*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -34,34 +34,18 @@ namespace mlir { namespace xegpu { -#define GEN_PASS_DEF_XEGPUOPTIMIZETRANSPOSE +#define GEN_PASS_DEF_XEGPUOPTIMIZEBLOCKLOADS #include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc" } // namespace xegpu } // namespace mlir -#define DEBUG_TYPE "xegpu-optimize-transpose" +#define DEBUG_TYPE "xegpu-optimize-block-loads" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") using namespace mlir; namespace { -struct Allowed2DShapeRange { - int64_t minWidth, maxWidth, minHeight, maxHeight; -}; - -/// Helper to get the size range of a 2D block that can be transposed by HW. -/// TODO: Use uArch to get supported block ranges. -static Allowed2DShapeRange getTransposableBlockRange(int bitWidth) { - switch (bitWidth) { - case 32: - return {/**min width**/ 1, /**max width**/ 8, /**min height**/ 1, - /**max height**/ 32}; - default: - llvm_unreachable("Add support for other element bitwidths"); - } -} - /// Get the 2D lane data from a tensor desc type if it exists. 
static std::optional> getMaybeLaneData(xegpu::TensorDescType tdescType) { @@ -429,7 +413,7 @@ class VectorExtractOpPattern final } // namespace -void xegpu::populateXeGPUOptimizeTransposePatterns( +void xegpu::populateXeGPUOptimizeBlockLoadsPatterns( RewritePatternSet &patterns) { patterns.add(patterns.getContext()); @@ -437,9 +421,9 @@ void xegpu::populateXeGPUOptimizeTransposePatterns( namespace { -struct XeGPUOptimizeTransposePass final - : public xegpu::impl::XeGPUOptimizeTransposeBase< - XeGPUOptimizeTransposePass> { +struct XeGPUOptimizeBlockLoadsPass final + : public xegpu::impl::XeGPUOptimizeBlockLoadsBase< + XeGPUOptimizeBlockLoadsPass> { void runOnOperation() override { MLIRContext &context = getContext(); TypeConverter converter; @@ -448,15 +432,15 @@ struct XeGPUOptimizeTransposePass final // This pass is only meant for PVC and BMG targets. If unsupported target // is found, exit early. - bool isTargetSupported = true; + bool isTargetSupported = false; getOperation()->walk([&](gpu::GPUFuncOp funcOp) { auto chipStr = xegpu::getChipStr(funcOp); - if (chipStr && chipStr.value() != "pvc" && chipStr.value() != "bmg") - isTargetSupported = false; + if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg")) + isTargetSupported = true; }); if (!isTargetSupported) { - DBGS() << "XeGPUOptimizeTransposePass only supports PVC and BMG targets." + DBGS() << "XeGPUOptimizeBlockLoadsPass only supports PVC and BMG targets." << "\n"; return; } @@ -489,10 +473,10 @@ struct XeGPUOptimizeTransposePass final vector::VectorDialect>(); scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, target); - xegpu::populateXeGPUOptimizeTransposePatterns(patterns); + xegpu::populateXeGPUOptimizeBlockLoadsPatterns(patterns); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) { - DBGS() << "Optimize transpose pass failed.\n"; + DBGS() << "Optimize block loads pass failed.\n"; return signalPassFailure(); } } diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir index c1dc2a32c2b0c..e1b7ce9a57e61 100644 --- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt -xegpu-optimize-transpose -canonicalize -split-input-file %s | FileCheck %s -// CHECK-LABEL: func.func @no_scf( +// CHECK-LABEL: gpu.func @no_scf( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xf16>, %{{.*}}: vector<8x16xf16>) -> vector<8x16xf32> { // CHECK: %[[C16:.*]] = arith.constant 16 : index // CHECK: %[[C32:.*]] = arith.constant 32 : index @@ -16,18 +16,20 @@ #a = #xegpu.layout #b = #xegpu.layout #bt = #xegpu.layout -func.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x16xf32> { +gpu.module @xevm_module { +gpu.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x16xf32> { %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xf16> -> !xegpu.tensor_desc<16x16xf16, #b> %1 = xegpu.load_nd %0[%c0, %c32] { result_layout = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #a } : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - return %6 : vector<8x16xf32> + gpu.return %6 : vector<8x16xf32> +} } // ----- -// CHECK-LABEL: func.func @no_scf_i8( +// CHECK-LABEL: gpu.func @no_scf_i8( // 
CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xi8>, %{{.*}}: vector<8x32xi8>) -> vector<8x16xi32> { // CHECK: %[[C16:.*]] = arith.constant 16 : index // CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xi8> -> index @@ -43,19 +45,21 @@ func.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8 #b = #xegpu.layout #bt = #xegpu.layout #c = #xegpu.layout -func.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8x16xi32> { +gpu.module @xevm_module { +gpu.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8x16xi32> { %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xi8> -> !xegpu.tensor_desc<16x32xi8, #b> %1 = xegpu.load_nd %0[%c0, %c64] { result_layout = #b } : !xegpu.tensor_desc<16x32xi8, #b> -> vector<16x32xi8> %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x32xi8> to vector<32x16xi8> %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #c } : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> - return %6 : vector<8x16xi32> + gpu.return %6 : vector<8x16xi32> +} } // ----- -// CHECK-LABEL: func.func @gemm_b_transpose( +// CHECK-LABEL: gpu.func @gemm_b_transpose( // CHECK-SAME: %{{.*}} memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { // CHECK: %[[C128:.*]] = arith.constant 128 : index // CHECK: %[[C1:.*]] = arith.constant 1 : index @@ -75,7 +79,8 @@ func.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector< #a = #xegpu.layout #b = #xegpu.layout #bt = #xegpu.layout -func.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { +gpu.module @xevm_module { +gpu.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c256 = arith.constant 256 : index @@ -91,11 +96,12 @@ func.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf1 scf.yield %8 : vector<8x16xf32> } {layout_result_0 = #a} xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> - return + gpu.return +} } // ----- -// CHECK-LABEL: func.func @nested_scf( +// CHECK-LABEL: gpu.func @nested_scf( // CHECK-SAME: %{{.*}}: memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { // CHECK: %[[C128:.*]] = arith.constant 128 : index // CHECK: %[[C1:.*]] = arith.constant 1 : index @@ -116,7 +122,8 @@ func.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf1 #a = #xegpu.layout #b = #xegpu.layout #bt = #xegpu.layout -func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { +gpu.module @xevm_module { +gpu.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c256 = arith.constant 256 : index @@ -134,11 +141,12 @@ func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %a } {layout_result_0 = #a} xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> } - return + gpu.return +} } // ----- -// CHECK-LABEL: func.func @large_loads( +// CHECK-LABEL: gpu.func @large_loads( // CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { // CHECK: %[[C128:.*]] = 
arith.constant 128 : index // CHECK: %[[C8:.*]] = arith.constant 8 : index @@ -166,7 +174,8 @@ func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %a #a = #xegpu.layout #b = #xegpu.layout #bt = #xegpu.layout -func.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { +gpu.module @xevm_module { +gpu.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c32 = arith.constant 32 : index @@ -199,11 +208,12 @@ func.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> - return + gpu.return +} } // ----- -// CHECK-LABEL: func.func @array_length( +// CHECK-LABEL: gpu.func @array_length( // CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %arg2: memref<256x256xf32>) { // CHECK: %[[C128:.*]] = arith.constant 128 : index // CHECK: %[[C8:.*]] = arith.constant 8 : index @@ -226,7 +236,8 @@ func.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg #a = #xegpu.layout #b = #xegpu.layout #bt = #xegpu.layout -func.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { +gpu.module @xevm_module { +gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c32 = arith.constant 32 : index @@ -263,5 +274,6 @@ func.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %ar xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> - return + gpu.return +} } From 4f00ec465f7e586b44a75bd0e34018ffde3ebf7a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 3 Nov 2025 21:41:24 +0000 Subject: [PATCH 12/18] address comments --- .../Transforms/XeGPUOptimizeBlockLoads.cpp | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index 9a490c6c3cc3f..a59529b38f2e1 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -103,7 +103,8 @@ static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType, const uArch *targetuArch) { if (!canBeOptimized(tdescType)) return tdescType; - auto laneData = getMaybeLaneData(tdescType).value(); + auto laneData = getMaybeLaneData(tdescType) + .value(); // Lane data must exist if we reach here. int64_t innerLaneData = laneData[1]; int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); // Required shape is total shape of the vector result that this tensor desc @@ -146,18 +147,12 @@ static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType, tdescType.getMemorySpace(), newLayout); } -/// Helper to create a constant index value. 
-static Value createConstantIndex(ConversionPatternRewriter &rewriter, - Location loc, int64_t value) { - return arith::ConstantIndexOp::create(rewriter, loc, value).getResult(); -} - /// Helper to convert an OpFoldResult to Value. static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc, OpFoldResult ofr) { std::optional mayBeInt = getConstantIntValue(ofr); if (mayBeInt) - return createConstantIndex(rewriter, loc, *mayBeInt); + return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult(); return llvm::cast(ofr); } @@ -169,10 +164,12 @@ static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc, int64_t shiftAmount = llvm::Log2_64(constant); return arith::ShRUIOp::create( rewriter, loc, val, - createConstantIndex(rewriter, loc, shiftAmount)) + arith::ConstantIndexOp::create(rewriter, loc, shiftAmount) + .getResult()) .getResult(); } - auto constantOp = createConstantIndex(rewriter, loc, constant); + auto constantOp = + arith::ConstantIndexOp::create(rewriter, loc, constant).getResult(); return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult(); } @@ -186,8 +183,8 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, xegpu::LoadNdOp origLoadOp) { Location loc = data.getLoc(); assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp"); - Value offsetX = convertToValue(rewriter, loc, offsets[offsets.size() - 2]); - Value offsetY = convertToValue(rewriter, loc, offsets[offsets.size() - 1]); + Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]); + Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]); SmallVector supportedShape(newTensorDesc.getType().getShape()); // Compute the ratio between original shape and supported shape. We need to // generate loads in this ratio arrangement. @@ -196,14 +193,16 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, .value(); // `ratio` must be defined if we reach here. for (int64_t h = 0; h < shapeRatio[0]; ++h) { for (int64_t w = 0; w < shapeRatio[1]; ++w) { - int64_t localOffsetX = h * supportedShape[0]; - int64_t localOffsetY = w * supportedShape[1]; + int64_t localOffsetDim0 = h * supportedShape[0]; + int64_t localOffsetDim1 = w * supportedShape[1]; Value loadOffsetX = arith::AddIOp::create( - rewriter, loc, offsetX, - createConstantIndex(rewriter, loc, localOffsetX)); + rewriter, loc, offsetDim0, + arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0) + .getResult()); Value loadOffsetY = arith::AddIOp::create( - rewriter, loc, offsetY, - createConstantIndex(rewriter, loc, localOffsetY)); + rewriter, loc, offsetDim1, + arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1) + .getResult()); auto loadOp = xegpu::LoadNdOp::create( rewriter, loc, VectorType::get(supportedShape, data.getType().getElementType()), @@ -217,7 +216,7 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, // Insert the loaded block into the right position in data. auto insertOp = vector::InsertStridedSliceOp::create( rewriter, loc, loadOp.getResult(), data, - ArrayRef{localOffsetX, localOffsetY}, + ArrayRef{localOffsetDim0, localOffsetDim1}, ArrayRef{1, 1}); // InsertOp must have the same layout as newTensorDesc. 
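      // (For illustration only, since the real block sizes come from the
      // target uArch: an original 32x16 result with a 16x16 supported block
      // gives a shape ratio of [2, 1], so two loads are issued and their
      // results are inserted into `data` at row offsets 0 and 16.)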
xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr); @@ -227,7 +226,7 @@ static Value generateLoads(ConversionPatternRewriter &rewriter, return data; } -/// Checks is a CreateNdDescOp can be optimized for transpose, if so creates a +/// Checks if a CreateNdDescOp can be optimized for transpose, if so creates a /// new CreateNdDescOp with optimized tensor desc type. This involves extracting /// the base pointer from the original memory source and adjusting the shape and /// strides of the tensor desc to fit with the new optimized transpose layout. @@ -254,7 +253,8 @@ class XeGPUCreateNdDescOpPattern final auto maybeConstInnerStride = getConstantIntValue(strides.back()); // Only row-major memrefs are expected for now. if (!maybeConstInnerStride || *maybeConstInnerStride != 1) - return failure(); + return rewriter.notifyMatchFailure( + createNdOp, "Expecting row-major memref for transpose optimization."); Value source = createNdOp.getSource(); auto optionalLaneData = getMaybeLaneData(tdescTy); assert(optionalLaneData && "Expected 2D lane data"); @@ -282,10 +282,9 @@ class XeGPUCreateNdDescOpPattern final if (memrefType && memrefType.hasStaticShape()) { auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create( rewriter, createNdOp.getLoc(), source); - source = arith::IndexCastOp::create( - rewriter, createNdOp.getLoc(), - IntegerType::get(rewriter.getContext(), 64), - extractOp.getResult()) + source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(), + rewriter.getI64Type(), + extractOp.getResult()) .getResult(); } // Create a new CreateNdDescOp with the modified shape and converted type. @@ -346,10 +345,11 @@ class XeGPULoadNdDescOpPattern final Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(), modifiedOffsets.back()); modifiedOffsets.back() = - arith::AddIOp::create(rewriter, loadNdOp->getLoc(), offsetY, - createConstantIndex(rewriter, - loadNdOp->getLoc(), - i * origDataShape[1])) + arith::AddIOp::create( + rewriter, loadNdOp->getLoc(), offsetY, + arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(), + i * origDataShape[1]) + .getResult()) .getResult(); slice = generateLoads( rewriter, cast>(slice), modifiedOffsets, From 60ec9f5cf8f390ec010064b957f46ee089e9caf1 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 3 Nov 2025 21:48:18 +0000 Subject: [PATCH 13/18] address comments --- .../XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index a59529b38f2e1..23a045946ad6e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -72,8 +72,8 @@ getMaybeLaneLayout(xegpu::TensorDescType tdescType) { /// A layout can be optimized if its lane layout is transposed (lane[0] != 1 && /// lane[1] == 1), but inner lane data is not equal to [1, 1]. -static bool canBeOptimized(ArrayRef laneLayout, - ArrayRef laneData) { +static bool canBeOptimizedForTranspose(ArrayRef laneLayout, + ArrayRef laneData) { if (laneLayout.size() != 2 || laneData.size() != 2) return false; if (laneLayout[0] == 1 || laneLayout[1] != 1) @@ -85,7 +85,7 @@ static bool canBeOptimized(ArrayRef laneLayout, /// A tensor desc type can be optimized if its element type is less than 32 bits /// and its layout can be optimized. 
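/// (For example, f16 and i8 descriptors may qualify; f32 and i32 descriptors
/// already have 32-bit elements and are left untouched.)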
-static bool canBeOptimized(xegpu::TensorDescType tdescType) { +static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) { // If the dtype is greater or equal to 32 bits, layout must be valid. int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); if (elementTyBitwidth >= 32) @@ -94,14 +94,14 @@ static bool canBeOptimized(xegpu::TensorDescType tdescType) { auto maybeLaneData = getMaybeLaneData(tdescType); if (!maybeLaneData || !maybeLaneLayout) return false; - return canBeOptimized(*maybeLaneLayout, *maybeLaneData); + return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData); } /// Check if a tensor desc type can be optimized for transpose, if so return the /// new optimized tensor desc type with a valid transpose layout. static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType, const uArch *targetuArch) { - if (!canBeOptimized(tdescType)) + if (!canBeOptimizedForTranspose(tdescType)) return tdescType; auto laneData = getMaybeLaneData(tdescType) .value(); // Lane data must exist if we reach here. @@ -449,11 +449,11 @@ struct XeGPUOptimizeBlockLoadsPass final // converted. target.addDynamicallyLegalOp( [&](xegpu::CreateNdDescOp createNdOp) { - return !canBeOptimized(createNdOp.getType()); + return !canBeOptimizedForTranspose(createNdOp.getType()); }); target.addDynamicallyLegalOp( [&](xegpu::LoadNdOp loadNdOp) { - return !canBeOptimized(loadNdOp.getTensorDescType()); + return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType()); }); // Vector ExtractOps can have optimizable layouts if they extract from // LoadNdOps with array length greater than 1. These ExtractOps must be @@ -465,7 +465,7 @@ struct XeGPUOptimizeBlockLoadsPass final return true; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); auto laneData = layout.getEffectiveLaneDataAsInt(); - return !canBeOptimized(laneLayout, laneData); + return !canBeOptimizedForTranspose(laneLayout, laneData); }); converter.addConversion([](Type type) { return type; }); From c8590bb21af8a6f9a57c7a5e560e3ebb6bd3c724 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 3 Nov 2025 21:59:27 +0000 Subject: [PATCH 14/18] address comments --- .../Transforms/XeGPUOptimizeBlockLoads.cpp | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index 23a045946ad6e..f9799070d84f0 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -72,8 +72,17 @@ getMaybeLaneLayout(xegpu::TensorDescType tdescType) { /// A layout can be optimized if its lane layout is transposed (lane[0] != 1 && /// lane[1] == 1), but inner lane data is not equal to [1, 1]. -static bool canBeOptimizedForTranspose(ArrayRef laneLayout, - ArrayRef laneData) { +/// Example: +/// !xegpu.tensor_desc<16x16xf16, +/// #xegpu.layout> +/// In this case, lane layout is transposed (from the usual [1, SG_SIZE] form) +/// indicating that this is a load that requires transpose effect. However, +/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements form +/// the inner dimension. We convert this to a canonical form by converting the +/// tensor_desc to i32 type such that lane data becomes [1, 1]. This makes the +/// later lowering can easily use the load with transpose instruction. 
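/// As a concrete sketch (the layout parameters here are illustrative and
/// depend on the subgroup size and target uArch), a descriptor such as
///   !xegpu.tensor_desc<16x16xf16,
///     #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
/// would be rewritten to roughly
///   !xegpu.tensor_desc<16x8xi32,
///     #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
/// where each i32 element packs two adjacent f16 values along the inner
/// dimension.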
+static bool canBeCanonicalizedForTranspose(ArrayRef laneLayout, + ArrayRef laneData) { if (laneLayout.size() != 2 || laneData.size() != 2) return false; if (laneLayout[0] == 1 || laneLayout[1] != 1) @@ -85,7 +94,7 @@ static bool canBeOptimizedForTranspose(ArrayRef laneLayout, /// A tensor desc type can be optimized if its element type is less than 32 bits /// and its layout can be optimized. -static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) { +static bool canBeCanonicalizedForTranspose(xegpu::TensorDescType tdescType) { // If the dtype is greater or equal to 32 bits, layout must be valid. int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); if (elementTyBitwidth >= 32) @@ -94,14 +103,14 @@ static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) { auto maybeLaneData = getMaybeLaneData(tdescType); if (!maybeLaneData || !maybeLaneLayout) return false; - return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData); + return canBeCanonicalizedForTranspose(*maybeLaneLayout, *maybeLaneData); } /// Check if a tensor desc type can be optimized for transpose, if so return the /// new optimized tensor desc type with a valid transpose layout. static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType, const uArch *targetuArch) { - if (!canBeOptimizedForTranspose(tdescType)) + if (!canBeCanonicalizedForTranspose(tdescType)) return tdescType; auto laneData = getMaybeLaneData(tdescType) .value(); // Lane data must exist if we reach here. @@ -449,11 +458,11 @@ struct XeGPUOptimizeBlockLoadsPass final // converted. target.addDynamicallyLegalOp( [&](xegpu::CreateNdDescOp createNdOp) { - return !canBeOptimizedForTranspose(createNdOp.getType()); + return !canBeCanonicalizedForTranspose(createNdOp.getType()); }); target.addDynamicallyLegalOp( [&](xegpu::LoadNdOp loadNdOp) { - return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType()); + return !canBeCanonicalizedForTranspose(loadNdOp.getTensorDescType()); }); // Vector ExtractOps can have optimizable layouts if they extract from // LoadNdOps with array length greater than 1. 
These ExtractOps must be @@ -465,7 +474,7 @@ struct XeGPUOptimizeBlockLoadsPass final return true; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); auto laneData = layout.getEffectiveLaneDataAsInt(); - return !canBeOptimizedForTranspose(laneLayout, laneData); + return !canBeCanonicalizedForTranspose(laneLayout, laneData); }); converter.addConversion([](Type type) { return type; }); From 1af68c7648bea60884078e637e8d7493179bb347 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 3 Nov 2025 22:07:48 +0000 Subject: [PATCH 15/18] address comments --- mlir/test/Dialect/XeGPU/optimize-transpose.mlir | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir index e1b7ce9a57e61..24a0de6ed48a5 100644 --- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -1,4 +1,5 @@ -// RUN: mlir-opt -xegpu-optimize-transpose -canonicalize -split-input-file %s | FileCheck %s +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ +// RUN: --xegpu-optimize-block-loads --canonicalize --cse --split-input-file %s | FileCheck %s // CHECK-LABEL: gpu.func @no_scf( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xf16>, %{{.*}}: vector<8x16xf16>) -> vector<8x16xf32> { From 22e25a9a250d5da54fcb9c5d7550449cc6c6fb74 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 3 Nov 2025 22:13:25 +0000 Subject: [PATCH 16/18] remove unused headers --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index f9799070d84f0..684738122e446 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -19,17 +19,13 @@ #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" #include "mlir/Dialect/XeGPU/uArch/uArchBase.h" -#include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OpDefinition.h" #include "mlir/IR/Types.h" #include "mlir/IR/Value.h" -#include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include #include namespace mlir { From f70c07c04309b0165db094d2961ab8462af51753 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 3 Nov 2025 22:25:32 +0000 Subject: [PATCH 17/18] fix comment --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index 684738122e446..f4da22aa2fa13 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -73,10 +73,10 @@ getMaybeLaneLayout(xegpu::TensorDescType tdescType) { /// #xegpu.layout> /// In this case, lane layout is transposed (from the usual [1, SG_SIZE] form) /// indicating that this is a load that requires transpose effect. However, -/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements form +/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements from /// the inner dimension. 
We convert this to a canonical form by converting the /// tensor_desc to i32 type such that lane data becomes [1, 1]. This makes the -/// later lowering can easily use the load with transpose instruction. +/// later lowering easily use the load with transpose instruction. static bool canBeCanonicalizedForTranspose(ArrayRef laneLayout, ArrayRef laneData) { if (laneLayout.size() != 2 || laneData.size() != 2) From 51f4c4bef65eb451251304615ed38b8748a00902 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 4 Nov 2025 17:22:24 +0000 Subject: [PATCH 18/18] fix comment --- .../Transforms/XeGPUOptimizeBlockLoads.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp index f4da22aa2fa13..4dc5ea4f7bb24 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -74,11 +74,11 @@ getMaybeLaneLayout(xegpu::TensorDescType tdescType) { /// In this case, lane layout is transposed (from the usual [1, SG_SIZE] form) /// indicating that this is a load that requires transpose effect. However, /// lane data is [1, 2], meaning that each lane must grab 2 f16 elements from -/// the inner dimension. We convert this to a canonical form by converting the +/// the inner dimension. We convert this to a optimized form by converting the /// tensor_desc to i32 type such that lane data becomes [1, 1]. This makes the /// later lowering easily use the load with transpose instruction. -static bool canBeCanonicalizedForTranspose(ArrayRef laneLayout, - ArrayRef laneData) { +static bool canBeOptimizedForTranspose(ArrayRef laneLayout, + ArrayRef laneData) { if (laneLayout.size() != 2 || laneData.size() != 2) return false; if (laneLayout[0] == 1 || laneLayout[1] != 1) @@ -90,7 +90,7 @@ static bool canBeCanonicalizedForTranspose(ArrayRef laneLayout, /// A tensor desc type can be optimized if its element type is less than 32 bits /// and its layout can be optimized. -static bool canBeCanonicalizedForTranspose(xegpu::TensorDescType tdescType) { +static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) { // If the dtype is greater or equal to 32 bits, layout must be valid. int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); if (elementTyBitwidth >= 32) @@ -99,14 +99,14 @@ static bool canBeCanonicalizedForTranspose(xegpu::TensorDescType tdescType) { auto maybeLaneData = getMaybeLaneData(tdescType); if (!maybeLaneData || !maybeLaneLayout) return false; - return canBeCanonicalizedForTranspose(*maybeLaneLayout, *maybeLaneData); + return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData); } /// Check if a tensor desc type can be optimized for transpose, if so return the /// new optimized tensor desc type with a valid transpose layout. static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType, const uArch *targetuArch) { - if (!canBeCanonicalizedForTranspose(tdescType)) + if (!canBeOptimizedForTranspose(tdescType)) return tdescType; auto laneData = getMaybeLaneData(tdescType) .value(); // Lane data must exist if we reach here. @@ -454,11 +454,11 @@ struct XeGPUOptimizeBlockLoadsPass final // converted. 
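    // (For example, an f32 load, or one whose lane data is already [1, 1], is
    // reported legal here and left untouched; only sub-32-bit loads with a
    // transposed lane layout and packed lane data are rewritten.)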
target.addDynamicallyLegalOp( [&](xegpu::CreateNdDescOp createNdOp) { - return !canBeCanonicalizedForTranspose(createNdOp.getType()); + return !canBeOptimizedForTranspose(createNdOp.getType()); }); target.addDynamicallyLegalOp( [&](xegpu::LoadNdOp loadNdOp) { - return !canBeCanonicalizedForTranspose(loadNdOp.getTensorDescType()); + return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType()); }); // Vector ExtractOps can have optimizable layouts if they extract from // LoadNdOps with array length greater than 1. These ExtractOps must be @@ -470,7 +470,7 @@ struct XeGPUOptimizeBlockLoadsPass final return true; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); auto laneData = layout.getEffectiveLaneDataAsInt(); - return !canBeCanonicalizedForTranspose(laneLayout, laneData); + return !canBeOptimizedForTranspose(laneLayout, laneData); }); converter.addConversion([](Type type) { return type; });
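    // The converter registered just above is an identity mapping: the
    // framework performs no type conversion of its own; the patterns build
    // the new i32-packed tensor descriptor types directly.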