From 777a403f896d811dbe36a7aed6ccacf6adf9c833 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Mon, 12 May 2025 19:36:58 +0000 Subject: [PATCH 01/41] add utils --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 15 +++++++ .../Transforms/XeGPUSubgroupDistribute.cpp | 27 +++++-------- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 40 +++++++++++++++++++ 3 files changed, 64 insertions(+), 18 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 3616fa614e7f9..5c2a308887040 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -13,6 +13,9 @@ namespace mlir { class VectorType; +class OpOperand; +class OpResult; + namespace xegpu { class LayoutAttr; class TensorDescType; @@ -50,6 +53,18 @@ FailureOr getDistributedVectorType(xegpu::TensorDescType tdescTy); FailureOr getDistributedVectorType(VectorType originalType, LayoutAttr layout); +/// Retrieves the LayoutAttr associated with a given Value. For TensorDescType +/// values, the LayoutAttr is extracted from the TensorDescType itself. For +/// other values, it is obtained from the attributes of the defining operation. +/// Returns nullptr if no LayoutAttr is found. +LayoutAttr getLayoutAttr(Value value); + +/// Retrieves the name for the LayoutAttr associated with a given OpOperand. +std::string getLayoutName(OpOperand &opr); + +/// Retrieves the name for the LayoutAttr associated with a given OpResult. +std::string getLayoutName(OpResult res); + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 2300d9e3bd43f..ca887bd0fb7b5 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -62,8 +62,6 @@ constexpr unsigned packedSizeInBitsForDefault = 16; // Minimum packing size per register for DPAS A. constexpr unsigned packedSizeInBitsForDpasB = 32; // Minimum packing size per register for DPAS B. -static const char *const operandLayoutNamePrefix = "layout_operand_"; -static const char *const resultLayoutNamePrefix = "layout_result_"; namespace { @@ -728,10 +726,7 @@ class LayoutAttrAssignment { void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) { for (OpOperand &user : v.getUses()) { Operation *owner = user.getOwner(); - unsigned operandNumber = user.getOperandNumber(); - // Use a generic name for ease of querying the layout attribute later. - std::string attrName = - operandLayoutNamePrefix + std::to_string(operandNumber); + std::string attrName = xegpu::getLayoutName(user); owner->setAttr(attrName, layout); } } @@ -805,10 +800,10 @@ LogicalResult LayoutAttrAssignment::assign(Operation *op) { return success(); } // Otherwise simply attach the layout to the op itself. - for (auto [i, r] : llvm::enumerate(op->getResults())) { + for (auto r : op->getOpResults()) { xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(r); if (layoutInfo) { - std::string attrName = resultLayoutNamePrefix + std::to_string(i); + std::string attrName = xegpu::getLayoutName(r); op->setAttr(attrName, layoutInfo); // Attach the layout attribute to the users of the result. 
assignToUsers(r, layoutInfo); @@ -928,11 +923,8 @@ static SmallVector removeTemporaryLayoutAttributes(ArrayRef attrs) { SmallVector newAttrs; for (NamedAttribute attr : attrs) { - if (attr.getName().strref().contains(operandLayoutNamePrefix) || - attr.getName().strref().contains(resultLayoutNamePrefix)) { - continue; - } - newAttrs.push_back(attr); + if (!isa(attr.getValue())) + newAttrs.push_back(attr); } return newAttrs; } @@ -1335,11 +1327,10 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { auto dpasOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); - std::string layoutAName = - llvm::formatv("{0}{1}", operandLayoutNamePrefix, 0).str(); - std::string layoutBName = - llvm::formatv("{0}{1}", operandLayoutNamePrefix, 1).str(); - auto layoutCName = llvm::formatv("{0}{1}", resultLayoutNamePrefix, 0).str(); + std::string layoutAName = xegpu::getLayoutName(dpasOp->getOpOperand(0)); + std::string layoutBName = xegpu::getLayoutName(dpasOp->getOpOperand(1)); + std::string layoutCName = xegpu::getLayoutName(dpasOp->getOpResult(0)); + xegpu::LayoutAttr layoutA = dpasOp->getAttrOfType(layoutAName); xegpu::LayoutAttr layoutB = diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 6b45ed0ae4ced..d101ce07043ec 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -12,6 +12,8 @@ #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/IR/Operation.h" +#include "llvm/Support/FormatVariadic.h" #include #include @@ -83,3 +85,41 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType, /*memory_space=*/xegpu::MemorySpace::Global, layout); return xegpu::getDistributedVectorType(helperTdescTy); } + +xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) { + if (!value) + return LayoutAttr(); + + if (auto tdescTy = dyn_cast(value.getType())) + return tdescTy.getLayoutAttr(); + + if (auto result = dyn_cast(value)) { + Operation *defOp = result.getDefiningOp(); + assert(defOp && "result must have a defining op"); + std::string layoutName = getLayoutName(result); + if (defOp->hasAttr(layoutName)) + return defOp->getAttrOfType(layoutName); + } + + if (auto arg = dyn_cast(value)) { + auto parentOp = arg.getOwner()->getParentOp(); + if (auto funcOp = dyn_cast(parentOp)) { + std::string layoutName = getLayoutName(arg); + if (funcOp->hasAttr(layoutName)) + return funcOp->getAttrOfType(layoutName); + } + } + + return nullptr; +} + +std::string xegpu::getLayoutName(OpOperand &opr) { + const StringRef prefix("layout_operand_"); + return llvm::formatv("{0}{1}", prefix, opr.getOperandNumber()).str(); +} + +std::string xegpu::getLayoutName(OpResult res) { + const StringRef prefix = "layout_result_"; + return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str(); +} + From af01c99481e1a88fef78b2517cf9b2f531acbd9f Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Mon, 12 May 2025 19:37:07 +0000 Subject: [PATCH 02/41] add skeleton --- mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td | 12 ++++++++++++ mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt | 1 + 2 files changed, 13 insertions(+) diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index 3e81f2d0ed786..54782933fe5f8 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -38,4 +38,16 @@ def 
XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> { ]; } +def XeGPUInstructionlize: Pass<"xegpu-instructionlize"> { + let summary = "Instructionlize XeGPU ops"; + let description = [{ + The pass unrolls XeGPU ops working on large shapes into ops working on small shapes + (given by the inst_data in the layout attr), such that each of them can be dispatched + to a hardware instruction. + }]; + let dependentDialects = [ + "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect" + ]; +} + #endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt index 892eb791c46e7..1d94b4c4c03ac 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_dialect_library(MLIRXeGPUTransforms XeGPUFoldAliasOps.cpp + XeGPUInstructionlize.cpp XeGPUSubgroupDistribute.cpp XeGPUUnroll.cpp From e8b43fbfe2b3764dc804b13975154b0f584c7d9b Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 13 May 2025 00:44:02 +0000 Subject: [PATCH 03/41] add filter --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 4 ++++ mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 16 ++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 032ce5bc18334..3f5fe2cce4636 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -295,11 +295,15 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> { } LayoutAttr dropSgLayoutAndData() { + if (!getInstData() && !getLaneLayout()) + return nullptr; return LayoutAttr::get(getContext(), nullptr, nullptr, getInstData(), getLaneLayout(), getLaneData(), getOrder()); } LayoutAttr dropInstData() { + if (!getSgLayout() && !getLaneLayout()) + return nullptr; return LayoutAttr::get(getContext(), getSgLayout(), getSgData(), nullptr, getLaneLayout(), getLaneData(), getOrder()); } diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index d101ce07043ec..285a15062e402 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/IR/Operation.h" +#include "mlir/Interfaces/LoopLikeInterface.h" #include "llvm/Support/FormatVariadic.h" #include #include @@ -88,7 +89,7 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType, xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) { if (!value) - return LayoutAttr(); + return nullptr; if (auto tdescTy = dyn_cast(value.getType())) return tdescTy.getLayoutAttr(); @@ -96,6 +97,11 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) { if (auto result = dyn_cast(value)) { Operation *defOp = result.getDefiningOp(); assert(defOp && "result must have a defining op"); + + // for LoadNdOp, the layout is stored in the tensor descriptor + if (auto loadNd = dyn_cast(defOp)) + return getLayoutAttr(loadNd.getTensorDesc()); + std::string layoutName = getLayoutName(result); if (defOp->hasAttr(layoutName)) return defOp->getAttrOfType(layoutName); @@ -103,10 +109,9 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) { if (auto arg = dyn_cast(value)) { auto parentOp = arg.getOwner()->getParentOp(); - if (auto funcOp = dyn_cast(parentOp)) { - std::string layoutName =
getLayoutName(arg); - if (funcOp->hasAttr(layoutName)) - return funcOp->getAttrOfType(layoutName); + if (auto loop = dyn_cast(parentOp)) { + OpOperand *tiedInit = loop.getTiedLoopInit(arg); + return getLayoutAttr(tiedInit->get()); } } @@ -122,4 +127,3 @@ std::string xegpu::getLayoutName(OpResult res) { const StringRef prefix = "layout_result_"; return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str(); } - From 3f73fda71e833ef844eec19bd2eda0f3b6b31020 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 13 May 2025 01:06:29 +0000 Subject: [PATCH 04/41] clean up --- .../XeGPU/Transforms/XeGPUInstructionlize.cpp | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp new file mode 100644 index 0000000000000..b83ce86a357f0 --- /dev/null +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp @@ -0,0 +1,143 @@ +//===---- XeGPUInstructionlize.cpp -- XeGPU Instructionlize Pass ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/XeGPU/Transforms/Passes.h" + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" +#include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/Dialect/XeGPU/Transforms/Transforms.h" +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace mlir { +namespace xegpu { +#define GEN_PASS_DEF_XEGPUINSTRUCTIONLIZE +#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc" +} // namespace xegpu +} // namespace mlir + +#define DEBUG_TYPE "xegpu-instructionlize" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") + +using namespace mlir; + +namespace { + +/// Unroll XeGPU ops to their instruction-level representation. 
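+/// For illustration (a hypothetical shape; the actual tiles come from the
+/// layout's inst_data): with inst_data = [8, 16], an op such as
+///   %v = xegpu.load_nd %td : !xegpu.tensor_desc<16x32xf16, #layout> -> vector<16x32xf16>
+/// is unrolled into four load_nd ops, each producing a vector<8x16xf16> tile.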
+class XeGPUInstructionlizePass final + : public xegpu::impl::XeGPUInstructionlizeBase { +public: + void runOnOperation() override; + +private: + SmallVector getTileShape(TypedValue value) const; + std::optional> getTileShape(Operation *op) const; + bool needsUnroll(Operation *op) const; +}; +} // namespace + +SmallVector +XeGPUInstructionlizePass::getTileShape(TypedValue value) const { + assert(value && "value must be non-null"); + xegpu::LayoutAttr layout = xegpu::getLayoutAttr(value); + if (layout && layout.isSgLayout()) { + if (auto inst_data = layout.getInstData()) + return llvm::to_vector_of(inst_data.asArrayRef()); + } + return llvm::to_vector(value.getType().getShape()); +} + +std::optional> +XeGPUInstructionlizePass::getTileShape(Operation *op) const { + if (isa(op)) + return getTileShape(cast>(op->getResult(0))); + if (isa(op)) + return getTileShape(cast>(op->getOperand(0))); + if (isa(op)) + return getTileShape(cast>(op->getOperand(1))); + + if (isa(op)) { + auto a = cast>(op->getOperand(0)); + auto b = cast>(op->getOperand(1)); + SmallVector aTileShape = getTileShape(a); + SmallVector bTileShape = getTileShape(b); + + if (aTileShape.size() != 2 || bTileShape.size() != 2) + return std::nullopt; + + // semantic check for A and B + if (aTileShape[1] != bTileShape[0]) + return std::nullopt; + + // semantic check for C + if (op->getNumOperands() == 3) { + auto c = cast>(op->getOperand(2)); + SmallVector cTileShape = getTileShape(c); + int64_t expectedShape[2] = {aTileShape[0], bTileShape[1]}; + if (!llvm::equal(cTileShape, expectedShape)) + return std::nullopt; + } + + return SmallVector({aTileShape[0], aTileShape[1], bTileShape[1]}); + } + return std::nullopt; +} + +bool XeGPUInstructionlizePass::needsUnroll(Operation *op) const { + for (Value opr : op->getOperands()) { + if (auto value = dyn_cast>(opr)) { + auto tileShape = getTileShape(value); + // the tile should have the same rank as the original type + if (tileShape.size() != static_cast(value.getType().getRank())) + return false; + if (!llvm::equal(tileShape, value.getType().getShape())) + return true; + } + } + return false; +} + +void XeGPUInstructionlizePass::runOnOperation() { + MLIRContext *ctx = &getContext(); + xegpu::UnrollOptions options; + options.setFilterConstraint([&](Operation *op) -> LogicalResult { + return needsUnroll(op) ?
success() : failure(); + }); + + options.setNativeShapeFn( + [&](Operation *op) -> std::optional> { + return getTileShape(op); + }); + + options.setUnrolledTypesFn( + [&](ShapedType type, ArrayRef tileShape) -> SmallVector { + Type elemTy = type.getElementType(); + Type newTy; + + if (auto tdescTy = dyn_cast(type)) + newTy = xegpu::TensorDescType::get( + ctx, tileShape, elemTy, tdescTy.getEncoding(), + tdescTy.getLayoutAttr().dropInstData()); + else + newTy = type.clone(tileShape, elemTy); + + std::optional> ratio = + computeShapeRatio(type.getShape(), tileShape); + assert(ratio && + "The shape of the type must be a multiple of tileShape."); + return SmallVector(computeProduct(*ratio), newTy); + }); + + RewritePatternSet patterns(ctx); + + populateXeGPUUnrollPatterns(patterns, options); + (void)applyPatternsGreedily(getOperation(), std::move(patterns)); +} From ab448a34294bf2333af8ed52e6d4db540706d20f Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 13 May 2025 18:45:16 +0000 Subject: [PATCH 05/41] add scf type conversion util --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 5 + .../XeGPU/Transforms/XeGPUInstructionlize.cpp | 41 ++-- mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt | 1 + mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 182 ++++++++++++++++++ 4 files changed, 215 insertions(+), 14 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 5c2a308887040..4bcda3e3ac95f 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -65,6 +65,11 @@ std::string getLayoutName(OpOperand &opr); /// Retrieves the name for the LayoutAttr associated with a given OpResult. std::string getLayoutName(OpResult res); +/// Do type conversion for SCF structural ops, e.g., scf.for. Since VectorType +/// cannot carry the layout attribute, they are converted into RankedTensorType +/// first, which will convert back to VectorType in the second round. +void doSCFStructuralTypeConversionWithTensorType(Operation *op); + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp index b83ce86a357f0..efc44aadb14e6 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp @@ -38,21 +38,33 @@ class XeGPUInstructionlizePass final void runOnOperation() override; private: - SmallVector getTileShape(TypedValue value) const; + // Get the tile shape for a given value. If the value has a layout + // attribute and it is an SG layout, return the inst_data as the tile shape + // if inst_data is available; otherwise, return the original shape of the + // value. If the value does not have an SG layout, return std::nullopt. + std::optional> + getTileShape(TypedValue value) const; + + // Get the tile shape for a given operation. std::optional> getTileShape(Operation *op) const; + + // Determine if the operation requires unrolling. Return false if all operands + // and results have tile shapes identical to their original types. Otherwise, + // return true. 
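+  // For example (hypothetical shapes): an operand of vector<16x32xf16> whose
+  // layout specifies inst_data = [8, 16] requires unrolling, whereas an
+  // operand that is already vector<8x16xf16> does not.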
bool needsUnroll(Operation *op) const; }; } // namespace std::optional> XeGPUInstructionlizePass::getTileShape(TypedValue value) const { assert(value && "value must be non-null"); xegpu::LayoutAttr layout = xegpu::getLayoutAttr(value); if (layout && layout.isSgLayout()) { if (auto inst_data = layout.getInstData()) return llvm::to_vector_of(inst_data.asArrayRef()); + return llvm::to_vector(value.getType().getShape()); } - return llvm::to_vector(value.getType().getShape()); + return std::nullopt; } std::optional> @@ -67,26 +79,26 @@ XeGPUInstructionlizePass::getTileShape(Operation *op) const { if (isa(op)) { auto a = cast>(op->getOperand(0)); auto b = cast>(op->getOperand(1)); - SmallVector aTileShape = getTileShape(a); - SmallVector bTileShape = getTileShape(b); + std::optional> aTile = getTileShape(a); + std::optional> bTile = getTileShape(b); - if (aTileShape.size() != 2 || bTileShape.size() != 2) + if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2) return std::nullopt; // semantic check for A and B - if (aTileShape[1] != bTileShape[0]) + if ((*aTile)[1] != (*bTile)[0]) return std::nullopt; // semantic check for C if (op->getNumOperands() == 3) { auto c = cast>(op->getOperand(2)); - SmallVector cTileShape = getTileShape(c); - int64_t expectedShape[2] = {aTileShape[0], bTileShape[1]}; - if (!llvm::equal(cTileShape, expectedShape)) + std::optional> cTile = getTileShape(c); + int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]}; + if (!cTile || !llvm::equal(*cTile, expectedCTile)) return std::nullopt; } - return SmallVector({aTileShape[0], aTileShape[1], bTileShape[1]}); + return SmallVector({(*aTile)[0], (*aTile)[1], (*bTile)[1]}); } return std::nullopt; } @@ -94,11 +106,12 @@ XeGPUInstructionlizePass::getTileShape(Operation *op) const { bool XeGPUInstructionlizePass::needsUnroll(Operation *op) const { for (Value opr : op->getOperands()) { if (auto value = dyn_cast>(opr)) { - auto tileShape = getTileShape(value); + std::optional> tileShape = getTileShape(value); // the tile should have the same rank as the original type - if (tileShape.size() != static_cast(value.getType().getRank())) + if (!tileShape || + tileShape->size() != static_cast(value.getType().getRank())) return false; - if (!llvm::equal(tileShape, value.getType().getShape())) + if (!llvm::equal(*tileShape, value.getType().getShape())) return true; } } diff --git a/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt index afd8e2d5c4df3..98e84a4420722 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt @@ -6,5 +6,6 @@ add_mlir_dialect_library(MLIRXeGPUUtils LINK_LIBS PUBLIC MLIRIR + MLIRSCFTransforms MLIRXeGPUDialect ) diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 285a15062e402..e43aac4ce8dc0 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -11,9 +11,12 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/Dialect/SCF/Transforms/Patterns.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/IR/Operation.h" #include "mlir/Interfaces/LoopLikeInterface.h" +#include "mlir/Transforms/DialectConversion.h" #include "llvm/Support/FormatVariadic.h" #include #include @@ -127,3 +130,182 @@ std::string xegpu::getLayoutName(OpResult res) { const StringRef
prefix = "layout_result_"; return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str(); } + +void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) { + MLIRContext *context = op->getContext(); + + auto materializeCast = [&](OpBuilder &builder, Type type, ValueRange inputs, + Location loc) -> Value { + return builder.create(loc, type, inputs) + .getResult(0); + }; + + { // convert VectorType to RankedTensorType for SCF Structural ops + TypeConverter converter; + converter.addConversion([&](Type type) -> Type { return type; }); + converter.addConversion([&](VectorType type) -> Type { + return RankedTensorType::get(type.getShape(), type.getElementType()); + }); + converter.addSourceMaterialization(materializeCast); + converter.addTargetMaterialization(materializeCast); + + mlir::ConversionTarget target(*context); + target.addLegalOp(); + + mlir::RewritePatternSet patterns(context); + scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, + target); + (void)mlir::applyPartialConversion(op, target, std::move(patterns)); + } + + { // propagate the layout attribute to RankedTensorType by checking + // BuiltInUnrealizedCastOps + // for VectorType to RankedTensorType cast. + op->walk([&](UnrealizedConversionCastOp castOp) { + if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1) + return WalkResult::skip(); + + Value input = castOp.getInputs()[0]; + Value result = castOp.getResults()[0]; + auto inputTy = dyn_cast(input.getType()); + auto resultTy = dyn_cast(result.getType()); + + // Only look at ops casting from VectorType to RankedTensorType + if (!isa(inputTy) || !isa(resultTy)) + return WalkResult::skip(); + + xegpu::LayoutAttr layout = xegpu::getLayoutAttr(input); + if (!layout) + return WalkResult::skip(); + + RankedTensorType newTy = resultTy.cloneWithEncoding(layout); + result.setType(newTy); + + // update the arguments if user is a LoopLike op. 
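+      // For example (assuming an scf.for user), once the init value's type
+      // becomes tensor<16x32xf16, #layout>, the tied iter_arg must be retyped
+      // the same way so the layout encoding remains visible in the loop body.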
+ for (OpOperand &use : result.getUses()) { + if (auto loop = dyn_cast(use.getOwner())) { + BlockArgument arg = loop.getTiedLoopRegionIterArg(&use); + arg.setType(newTy); + } + // whileOp has two regions, the BlockArgument of the after region + // is not exposed by LoopLikeOpInterface + if (auto whileOp = dyn_cast(use.getOwner())) { + unsigned idx = use.getOperandNumber(); + BlockArgument arg = whileOp.getAfterArguments()[idx]; + arg.setType(newTy); + } + } + return WalkResult::advance(); + }); + + // using yieldOp as anchor to update the result type of its ParentOp + op->walk([&](scf::YieldOp yieldOp) { + Operation *parentOp = yieldOp->getParentOp(); + for (OpResult r : parentOp->getOpResults()) { + unsigned idx = r.getResultNumber(); + Type resultTy = r.getType(); + Type yieldTy = yieldOp.getResults()[idx].getType(); + if (isa(resultTy) && yieldTy != resultTy) + r.setType(yieldTy); + } + }); + } + + { // perform the conversion from RankedTensorType to VectorType based on the + // LayoutAttr + + auto computeTileShapeAndCount = [&](ArrayRef shape, + DenseI32ArrayAttr sgDataAttr, + DenseI32ArrayAttr sgLayoutAttr) { + SmallVector tileShape; + auto sgLayout = llvm::to_vector_of(sgLayoutAttr.asArrayRef()); + if (sgDataAttr) + tileShape = llvm::to_vector_of(sgDataAttr.asArrayRef()); + else + tileShape = computeShapeRatio(shape, sgLayout).value_or(tileShape); + assert(tileShape.size() && "failed to compute tileShape"); + SmallVector distUnit = + computeElementwiseMul(sgLayout, tileShape); + int count = computeProduct(shape) / computeProduct(distUnit); + return std::make_pair(tileShape, count); + }; + + TypeConverter converter; + converter.addConversion([&](Type type) -> Type { return type; }); + converter.addConversion( + [&](RankedTensorType type, + SmallVectorImpl &result) -> std::optional { + ArrayRef shape = type.getShape(); + auto encoding = type.getEncoding(); + Type elemTy = type.getElementType(); + + // init count and subShape to the default value. If the LayoutAttr + // is not present, it will return a VectorType with original shape. + int count = 1; + SmallVector subShape(shape); + + if (auto layout = + llvm::dyn_cast_if_present(encoding)) { + if (layout.isWgLayout()) { + // for WgToSg, the subShape is either from sgData or computed as + // shape/sgLayout + std::tie(subShape, count) = computeTileShapeAndCount( + shape, layout.getSgData(), layout.getSgLayout()); + } else if (DenseI32ArrayAttr instData = layout.getInstData()) { + // for unrolling, the subShape is determined by inst_data + subShape = llvm::to_vector_of(instData.asArrayRef()); + count = computeProduct(shape) / computeProduct(subShape); + } + } + auto newTy = VectorType::get(subShape, elemTy); + result.append(count, newTy); + return success(); + }); + + converter.addConversion( + [&](xegpu::TensorDescType type, + SmallVectorImpl &result) -> std::optional { + MLIRContext *ctx = type.getContext(); + Type elemTy = type.getElementType(); + Attribute encoding = type.getEncoding(); + ArrayRef shape = type.getShape(); + + // init count and newTy to the default value. If the layout attribute + // is not present, it will return the original type. 
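+          // For example (hypothetical shapes): a tensor_desc<32x32xf16> whose
+          // layout has inst_data = [16, 16] yields count = 4 and
+          // newTy = tensor_desc<16x16xf16>, with inst_data dropped.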
+ int count = 1; + Type newTy = type; + + if (xegpu::LayoutAttr layout = type.getLayoutAttr()) { + SmallVector subShape, distUnit; + if (layout.isWgLayout()) { + // for WgToSg, the subShape is either from sgData or computed as + // shape/sgLayout + std::tie(subShape, count) = computeTileShapeAndCount( + shape, layout.getSgData(), layout.getSgLayout()); + layout = layout.dropSgLayoutAndData(); + } else if (DenseI32ArrayAttr instData = layout.getInstData()) { + // for unrolling, the subShape is determined by inst_data + subShape = llvm::to_vector_of(instData.asArrayRef()); + count = computeProduct(shape) / computeProduct(subShape); + layout = layout.dropInstData(); + } + newTy = xegpu::TensorDescType::get(ctx, subShape, elemTy, encoding, + layout); + } + + result.append(count, newTy); + return success(); + }); + + converter.addSourceMaterialization(materializeCast); + converter.addTargetMaterialization(materializeCast); + + mlir::ConversionTarget target(*context); + target.addLegalOp(); + + mlir::RewritePatternSet patterns(context); + scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, + target); + (void)mlir::applyPartialConversion(op, target, std::move(patterns)); + } +} From 7b5e8f1193006591062592f5e8858c33113448fe Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 13 May 2025 20:02:45 +0000 Subject: [PATCH 06/41] partial working --- .../XeGPU/Transforms/XeGPUInstructionlize.cpp | 16 +++++++++++----- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp index efc44aadb14e6..737600fe909fa 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp @@ -120,18 +120,22 @@ bool XeGPUInstructionlizePass::needsUnroll(Operation *op) const { void XeGPUInstructionlizePass::runOnOperation() { MLIRContext *ctx = &getContext(); + Operation *op = getOperation(); + + // first perform type conversion for SCF control flow ops + xegpu::doSCFStructuralTypeConversionWithTensorType(op); + xegpu::UnrollOptions options; options.setFilterConstraint([&](Operation *op) -> LogicalResult { return needsUnroll(op) ?
success() : failure(); }); - options.setNativeShapeFn( - [&](Operation *op) -> std::optional> { + options.setNativeShapeFn([&](Operation *op) { return getTileShape(op); }); options.setUnrolledTypesFn( - [&](ShapedType type, ArrayRef tileShape) -> SmallVector { + [&](ShapedType type, ArrayRef tileShape) { Type elemTy = type.getElementType(); Type newTy; @@ -149,8 +153,10 @@ void XeGPUInstructionlizePass::runOnOperation() { return SmallVector(computeProduct(*ratio), newTy); }); - RewritePatternSet patterns(ctx); + GreedyRewriteConfig config; + config.setStrictness(GreedyRewriteStrictness::ExistingOps); + RewritePatternSet patterns(ctx); populateXeGPUUnrollPatterns(patterns, options); - (void)applyPatternsGreedily(getOperation(), std::move(patterns)); + (void)applyPatternsGreedily(getOperation(), std::move(patterns), config); } diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index e43aac4ce8dc0..cb2c4d40f8a6d 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -215,8 +215,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) { // LayoutAttr auto computeTileShapeAndCount = [&](ArrayRef shape, - DenseI32ArrayAttr sgDataAttr, - DenseI32ArrayAttr sgLayoutAttr) { + DenseI32ArrayAttr sgDataAttr, + DenseI32ArrayAttr sgLayoutAttr) { SmallVector tileShape; auto sgLayout = llvm::to_vector_of(sgLayoutAttr.asArrayRef()); if (sgDataAttr) @@ -224,8 +224,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) { else tileShape = computeShapeRatio(shape, sgLayout).value_or(tileShape); assert(tileShape.size() && "failed to compute tileShape"); - SmallVector distUnit = - computeElementwiseMul(sgLayout, tileShape); + SmallVector distUnit = computeElementwiseMul(sgLayout, tileShape); int count = computeProduct(shape) / computeProduct(distUnit); return std::make_pair(tileShape, count); }; @@ -249,8 +248,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) { if (layout.isWgLayout()) { // for WgToSg, the subShape is either from sgData or computed as // shape/sgLayout - std::tie(subShape, count) = computeTileShapeAndCount( - shape, layout.getSgData(), layout.getSgLayout()); + std::tie(subShape, count) = computeTileShapeAndCount(shape, layout.getSgData(), layout.getSgLayout()); } else if (DenseI32ArrayAttr instData = layout.getInstData()) { // for unrolling, the subShape is determined by inst_data subShape = llvm::to_vector_of(instData.asArrayRef()); @@ -280,8 +278,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) { if (layout.isWgLayout()) { // for WgToSg, the subShape is either from sgData or computed as // shape/sgLayout - std::tie(subShape, count) = computeTileShapeAndCount( - shape, layout.getSgData(), layout.getSgLayout()); + std::tie(subShape, count) = computeTileShapeAndCount(shape, layout.getSgData(), layout.getSgLayout()); layout = layout.dropSgLayoutAndData(); } else if (DenseI32ArrayAttr instData = layout.getInstData()) { // for unrolling, the subShape is determined by inst_data @@ -298,7 +295,11 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) { }); converter.addSourceMaterialization(materializeCast); - converter.addTargetMaterialization(materializeCast); + converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type, + ValueRange inputs, Location loc) { + return builder.create(loc, type, inputs) + .getResults(); + }); mlir::ConversionTarget target(*context); 
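+ // Note: the unrealized casts produced by the materializations above are
+ // treated as legal here; they are resolved separately later, once both
+ // sides of each cast have been rewritten.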
target.addLegalOp(); From e2eb9e63df30e9e84d3d09060ec493bc2b805f3d Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Thu, 15 May 2025 21:22:16 +0000 Subject: [PATCH 07/41] refactor pack and unpack --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 39 ++++- .../XeGPU/Transforms/XeGPUInstructionlize.cpp | 163 +++++++++++++----- .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 25 +-- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 152 +++++++++++++++- 4 files changed, 301 insertions(+), 78 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 4bcda3e3ac95f..b41da0ea6a276 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -15,6 +15,8 @@ namespace mlir { class VectorType; class OpOperand; class OpResult; +class OpBuilder; +class ValueRange; namespace xegpu { class LayoutAttr; class TensorDescType; @@ -53,17 +55,46 @@ FailureOr getDistributedVectorType(xegpu::TensorDescType tdescTy); FailureOr getDistributedVectorType(VectorType originalType, LayoutAttr layout); +/// Returns the attribute name for the OpOperand to attach LayoutAttr +std::string getLayoutName(OpOperand &opr); + +/// Returns the attribute name for the OpResult to attach LayoutAttr +std::string getLayoutName(OpResult res); + /// Retrieves the LayoutAttr associated with a given Value. For TensorDescType /// values, the LayoutAttr is extracted from the TensorDescType itself. For /// other values, it is obtained from the attributes of the defining operation. /// Returns nullptr if no LayoutAttr is found. LayoutAttr getLayoutAttr(Value value); -/// Retrieves the name for the LayoutAttr associated with a given OpOperand. -std::string getLayoutName(OpOperand &opr); +/// Retrieves the LayoutAttr associated with a given OpOperand. It will +/// first check the layout_operand_{id} attribute of the owner operation. If +/// not found, it will check the operand itself and its defining op. +LayoutAttr getLayoutAttr(OpOperand &opr); -/// Retrieves the name for the LayoutAttr associated with a given OpResult. -std::string getLayoutName(OpResult res); +/// Sets the LayoutAttr for a given OpOperand by attaching it to the owner +void setLayoutAttr(OpOperand &opr, LayoutAttr layout); + +/// Sets the LayoutAttr for the given OpResult by attaching it to the defining op +void setLayoutAttr(OpResult result, LayoutAttr layout); + +/// Sets the LayoutAttr for each OpOperand and OpResult of the given operation. +/// If the operation contains regions, it is also applied recursively to the +/// contained operations. +void setLayoutAttrs(Operation *op, + function_ref getLayoutImpl); + +/// Extract a set of small vectors from a value with a given shape using +/// vector.extract_strided_slice +SmallVector extractVectorsWithShapeFromValue(OpBuilder &builder, + Location loc, Value value, + ArrayRef shape); + +/// Create a vector of shape from a set of values using +/// vector.insert_strided_slice. +Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc, + ValueRange values, + ArrayRef shape); /// Do type conversion for SCF structural ops, e.g., scf.for.
Since VectorType /// cannot carry the layout attribute, they are converted into RankedTensorType /// first, which will convert back to VectorType in the second round. diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp index 737600fe909fa..0e01c7e4d9763 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -45,6 +46,10 @@ class XeGPUInstructionlizePass final std::optional> getTileShape(TypedValue value) const; + std::optional> getTileShape(OpOperand &operand) const; + + std::optional> getTileShape(OpResult result) const; + // Get the tile shape for a given operation. std::optional> getTileShape(Operation *op) const; @@ -67,20 +72,46 @@ XeGPUInstructionlizePass::getTileShape(TypedValue value) const { return std::nullopt; } +std::optional> +XeGPUInstructionlizePass::getTileShape(OpOperand &operand) const { + xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operand); + if (layout && layout.isSgLayout()) { + if (auto inst_data = layout.getInstData()) + return llvm::to_vector_of(inst_data.asArrayRef()); + + if (auto type = dyn_cast(operand.get().getType())) + return llvm::to_vector(type.getShape()); + } + return std::nullopt; +} + +std::optional> +XeGPUInstructionlizePass::getTileShape(OpResult result) const { + xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result); + if (layout && layout.isSgLayout()) { + if (auto inst_data = layout.getInstData()) + return llvm::to_vector_of(inst_data.asArrayRef()); + + if (auto type = dyn_cast(result.getType())) + return llvm::to_vector(type.getShape()); + } + return std::nullopt; +} + std::optional> XeGPUInstructionlizePass::getTileShape(Operation *op) const { if (isa(op)) - return getTileShape(cast>(op->getResult(0))); + return getTileShape(op->getOpResult(0)); if (isa(op)) - return getTileShape(cast>(op->getOperand(0))); + return getTileShape(op->getOpOperand(0)); if (isa(op)) - return getTileShape(cast>(op->getOperand(1))); + return getTileShape(op->getOpOperand(1)); if (isa(op)) { - auto a = cast>(op->getOperand(0)); - auto b = cast>(op->getOperand(1)); - std::optional> aTile = getTileShape(a); - std::optional> bTile = getTileShape(b); + std::optional> aTile = + getTileShape(op->getOpOperand(0)); + std::optional> bTile = + getTileShape(op->getOpOperand(1)); if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2) return std::nullopt; @@ -91,8 +122,8 @@ XeGPUInstructionlizePass::getTileShape(Operation *op) const { // semantic check for C if (op->getNumOperands() == 3) { - auto c = cast>(op->getOperand(2)); - std::optional> cTile = getTileShape(c); + std::optional> cTile = + getTileShape(op->getOpOperand(2)); int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]}; if (!cTile || !llvm::equal(*cTile, expectedCTile)) return std::nullopt; @@ -104,59 +135,101 @@ XeGPUInstructionlizePass::getTileShape(Operation *op) const { } bool XeGPUInstructionlizePass::needsUnroll(Operation *op) const { - for (Value opr : op->getOperands()) { - if (auto value = dyn_cast>(opr)) { - std::optional> tileShape = getTileShape(value); - // the tile should have the same rank as the original type - if (!tileShape || - tileShape->size() !=
static_cast(value.getType().getRank())) - return false; - if (!llvm::equal(*tileShape, value.getType().getShape())) - return true; - } + if (isa(op)) + return false; + + for (auto &opr : op->getOpOperands()) { + std::optional> tileShape = getTileShape(opr); + auto shapedType = dyn_cast(opr.get().getType()); + if (!shapedType) + continue; + + if (tileShape && !llvm::equal(*tileShape, shapedType.getShape())) + return true; + } + + for (auto result : op->getOpResults()) { + std::optional> tileShape = getTileShape(result); + auto shapedType = dyn_cast(result.getType()); + if (!shapedType) + continue; + + if (tileShape && !llvm::equal(*tileShape, shapedType.getShape())) + return true; } return false; } void XeGPUInstructionlizePass::runOnOperation() { MLIRContext *ctx = &getContext(); - Operation *op = getOperation(); + Operation *mod = getOperation(); + + // Preserve the LayoutAttr for each operand to the owner's DictionaryAttr. + // This ensures that the LayoutAttr remains accessible even if the defining + // operation is replaced. + xegpu::setLayoutAttrs(mod, [&](Value v) { return xegpu::getLayoutAttr(v); }); - // first perform type conversion for SCF control flow ops - xegpu::doSCFStructuralTypeConversionWithTensorType(op); + // Perform type conversion for SCF control flow ops + xegpu::doSCFStructuralTypeConversionWithTensorType(mod); xegpu::UnrollOptions options; options.setFilterConstraint([&](Operation *op) -> LogicalResult { return needsUnroll(op) ? success() : failure(); }); options.setNativeShapeFn([&](Operation *op) { return getTileShape(op); }); options.setUnrolledTypesFn([&](ShapedType type, ArrayRef tileShape) { Type elemTy = type.getElementType(); Type newTy; if (auto tdescTy = dyn_cast(type)) newTy = xegpu::TensorDescType::get( ctx, tileShape, elemTy, tdescTy.getEncoding(), tdescTy.getLayoutAttr().dropInstData()); else newTy = type.clone(tileShape, elemTy); std::optional> ratio = computeShapeRatio(type.getShape(), tileShape); assert(ratio && "The shape of the type must be a multiple of tileShape."); return SmallVector(computeProduct(*ratio), newTy); }); RewritePatternSet patterns(ctx); populateXeGPUUnrollPatterns(patterns, options); (void)applyPatternsGreedily(mod, std::move(patterns)); + + mod->walk([&](UnrealizedConversionCastOp castOp) { + ValueRange inputs = castOp.getInputs(); + ValueRange outputs = castOp.getOutputs(); + + if (inputs.size() == 1 && outputs.size() == 1) { + castOp->replaceAllUsesWith(inputs); + castOp->erase(); + } + + VectorType inputTy = dyn_cast(inputs[0].getType()); + VectorType outputTy = dyn_cast(outputs[0].getType()); + if (inputTy && outputTy) { + OpBuilder builder(castOp); + // unpack + if (inputs.size() > 1 &&
outputs.size() == 1) { + ArrayRef shape = outputTy.getShape(); + Value result = xegpu::createVectorWithShapeFromValues( + builder, castOp.getLoc(), inputs, shape); + castOp->replaceAllUsesWith(ValueRange(result)); + castOp->erase(); + } + + // pack + if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) { + ArrayRef tileShape = outputTy.getShape(); + SmallVector results = xegpu::extractVectorsWithShapeFromValue( + builder, castOp.getLoc(), inputs[0], tileShape); + castOp->replaceAllUsesWith(results); + castOp->erase(); + } + } + }); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index 44d45dd2eaec0..d9f69158f95eb 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" @@ -74,17 +75,7 @@ struct UnrollPattern : public OpRewritePattern { assert(vecTy.getRank() == static_cast(blockSize.size()) && "Expecting blockSize size to match the rank of destTy."); auto shape = vecTy.getShape(); - auto zeroAttr = rewriter.getZeroAttr(vecTy.getElementType()); - - Value result = rewriter.create( - loc, vecTy, DenseElementsAttr::get(vecTy, zeroAttr)); - for (auto [src, offsets] : - llvm::zip_equal(srcs, StaticTileOffsetRange(shape, blockSize))) { - SmallVector staticStrides(offsets.size(), 1); - result = rewriter.create( - loc, src, result, offsets, staticStrides); - } - return result; + return xegpu::createVectorWithShapeFromValues(rewriter, loc, srcs, shape); } if (isa(destTy)) { @@ -109,16 +100,8 @@ struct UnrollPattern : public OpRewritePattern { if (auto vecTy = dyn_cast(src.getType())) { assert(vecTy.getRank() == static_cast(blockSize.size()) && "Expecting blockSize size to match the rank of src."); - auto shape = vecTy.getShape(); - SmallVector results; - for (SmallVector offsets : - StaticTileOffsetRange(shape, blockSize)) { - SmallVector staticStrides(offsets.size(), 1); - auto slice = rewriter.create( - loc, src, offsets, blockSize, staticStrides); - results.push_back(slice); - } - return results; + return xegpu::extractVectorsWithShapeFromValue(rewriter, loc, src, + blockSize); } if (isa(src.getType())) { diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index cb2c4d40f8a6d..60c8493f552d8 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -14,15 +14,26 @@ #include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/IR/Builders.h" #include "mlir/IR/Operation.h" +#include "mlir/IR/ValueRange.h" #include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" #include #include using namespace mlir; +/// convert ArrayRef into SmallVector +static SmallVector flattenValues(ArrayRef values) { + SmallVector result; + for (const auto &vals : values) + llvm::append_range(result, vals); + return result; +} + FailureOr mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) { auto layout = 
llvm::dyn_cast_if_present(tdescTy.getLayout()); @@ -90,6 +101,16 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType, return xegpu::getDistributedVectorType(helperTdescTy); } +std::string xegpu::getLayoutName(OpOperand &opr) { + const StringRef prefix("layout_operand_"); + return llvm::formatv("{0}{1}", prefix, opr.getOperandNumber()).str(); +} + +std::string xegpu::getLayoutName(OpResult res) { + const StringRef prefix = "layout_result_"; + return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str(); +} + xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) { if (!value) return nullptr; @@ -121,14 +142,86 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) { return nullptr; } -std::string xegpu::getLayoutName(OpOperand &opr) { - const StringRef prefix("layout_operand_"); - return llvm::formatv("{0}{1}", prefix, opr.getOperandNumber()).str(); +xegpu::LayoutAttr xegpu::getLayoutAttr(OpOperand &opr) { + Operation *op = opr.getOwner(); + std::string layoutName = xegpu::getLayoutName(opr); + if (op->hasAttr(layoutName)) + return op->getAttrOfType(layoutName); + return getLayoutAttr(opr.get()); } -std::string xegpu::getLayoutName(OpResult res) { - const StringRef prefix = "layout_result_"; - return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str(); +void xegpu::setLayoutAttr(OpOperand &opr, LayoutAttr layout) { + auto owner = opr.getOwner(); + std::string name = xegpu::getLayoutName(opr); + if (layout && !owner->hasAttrOfType(name)) + owner->setAttr(name, layout); +} + +void xegpu::setLayoutAttr(OpResult result, LayoutAttr layout) { + Operation *owner = result.getOwner(); + std::string name = xegpu::getLayoutName(result); + if (layout && !owner->hasAttr(name)) + owner->setAttr(name, layout); +} + +void xegpu::setLayoutAttrs(Operation *mod, + function_ref getLayoutImpl) { + mod->walk([&](Operation *op) { + for (OpResult result : op->getOpResults()) { + auto layout = getLayoutImpl(result); + setLayoutAttr(result, layout); + } + for (OpOperand &opr : op->getOpOperands()) { + auto layout = getLayoutImpl(opr.get()); + setLayoutAttr(opr, layout); + } + }); +} + +SmallVector +xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc, + Value value, ArrayRef shape) { + auto vecTy = dyn_cast(value.getType()); + if (!vecTy) + return {value}; + + ArrayRef srcShape = vecTy.getShape(); + if (!computeShapeRatio(srcShape, shape)) + return {value}; + + SmallVector result; + for (SmallVector offsets : StaticTileOffsetRange(srcShape, shape)) { + SmallVector staticStrides(offsets.size(), 1); + result.push_back(builder.create( + loc, value, offsets, shape, staticStrides)); + } + + return result; +} + +Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc, + ValueRange values, + ArrayRef shape) { + VectorType inputTy = dyn_cast(values[0].getType()); + assert(llvm::all_of(values.getTypes(), + [&](Type type) { return type == inputTy; }) && + "values must be of the same VectorType"); + + Type elemTy = inputTy.getElementType(); + ArrayRef tileShape = inputTy.getShape(); + + VectorType resultTy = VectorType::get(shape, elemTy); + auto zeroAttr = builder.getZeroAttr(elemTy); + Value result = builder.create( + loc, resultTy, DenseElementsAttr::get(resultTy, zeroAttr)); + + for (auto [src, offsets] : + llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) { + SmallVector staticStrides(offsets.size(), 1); + result = builder.create( + loc, src, result, offsets, staticStrides); + } + return result; } void 
xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) { @@ -213,7 +306,6 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) { { // perform the conversion from RankedTensorType to VectorType based on the // LayoutAttr - auto computeTileShapeAndCount = [&](ArrayRef shape, DenseI32ArrayAttr sgDataAttr, DenseI32ArrayAttr sgLayoutAttr) { @@ -302,9 +394,53 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) { }); mlir::ConversionTarget target(*context); - target.addLegalOp(); + target.addDynamicallyLegalOp( + [&](UnrealizedConversionCastOp op) { + auto isTensorTy = [&](Type type) { + return isa(type); + }; + if (llvm::any_of(op->getOperandTypes(), isTensorTy) || + llvm::any_of(op->getResultTypes(), isTensorTy)) + return false; + return true; + }); + + class UnrealizedConversionCastOpPattern + : public OpConversionPattern { + using OpConversionPattern< + mlir::UnrealizedConversionCastOp>::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(mlir::UnrealizedConversionCastOp op, + OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto inputs = op.getOperands(); + auto outputs = op.getOutputs(); + + if (inputs.size() != 1 || outputs.size() != 1) + return failure(); + + auto inputTy = inputs[0].getType(); + auto outputTy = outputs[0].getType(); + + if (isa(inputTy) && isa(outputTy)) { + rewriter.replaceOpWithMultiple(op, adaptor.getInputs()); + return success(); + } + + if (isa(inputTy) && isa(outputTy)) { + SmallVector values = flattenValues(adaptor.getInputs()); + auto newOp = rewriter.create( + op.getLoc(), outputTy, values); + rewriter.replaceOp(op, newOp); + return success(); + } + return failure(); + } + }; mlir::RewritePatternSet patterns(context); + patterns.insert(context); scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, target); (void)mlir::applyPartialConversion(op, target, std::move(patterns)); From 6ec3604310f3abf10d576162b14e0820839056e5 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Thu, 15 May 2025 23:42:54 +0000 Subject: [PATCH 08/41] cleanup layout attr --- .../XeGPU/Transforms/XeGPUInstructionlize.cpp | 72 ++++++++++++------- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 6 +- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp index 0e01c7e4d9763..fba0f882ef632 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp @@ -32,6 +32,39 @@ using namespace mlir; namespace { +void resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) { + ValueRange inputs = castOp.getInputs(); + ValueRange outputs = castOp.getOutputs(); + + if (inputs.size() == 1 && outputs.size() == 1) { + castOp->replaceAllUsesWith(inputs); + castOp->erase(); + } + + VectorType inputTy = dyn_cast(inputs[0].getType()); + VectorType outputTy = dyn_cast(outputs[0].getType()); + if (inputTy && outputTy) { + OpBuilder builder(castOp); + // unpack + if (inputs.size() > 1 && outputs.size() == 1) { + ArrayRef shape = outputTy.getShape(); + Value result = xegpu::createVectorWithShapeFromValues( + builder, castOp.getLoc(), inputs, shape); + castOp->replaceAllUsesWith(ValueRange(result)); + castOp->erase(); + } + + // pack + if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) { + ArrayRef tileShape = outputTy.getShape(); + SmallVector results = 
xegpu::extractVectorsWithShapeFromValue( + builder, castOp.getLoc(), inputs[0], tileShape); + castOp->replaceAllUsesWith(results); + castOp->erase(); + } + } +} + /// Unroll XeGPU ops to their instruction-level representation. class XeGPUInstructionlizePass final : public xegpu::impl::XeGPUInstructionlizeBase { @@ -200,35 +233,22 @@ void XeGPUInstructionlizePass::runOnOperation() { populateXeGPUUnrollPatterns(patterns, options); (void)applyPatternsGreedily(mod, std::move(patterns)); - mod->walk([&](UnrealizedConversionCastOp castOp) { - ValueRange inputs = castOp.getInputs(); - ValueRange outputs = castOp.getOutputs(); + mod->walk([&](Operation *op) { + if (auto castOp = dyn_cast(op)) + resolveUnrealizedConversionCastOp(castOp); - if (inputs.size() == 1 && outputs.size() == 1) { - castOp->replaceAllUsesWith(inputs); - castOp->erase(); + for (OpOperand &opr : op->getOpOperands()) { + std::string name = xegpu::getLayoutName(opr); + if (auto layout = op->getAttrOfType(name)) + op->removeAttr(name); } - VectorType inputTy = dyn_cast(inputs[0].getType()); - VectorType outputTy = dyn_cast(outputs[0].getType()); - if (inputTy && outputTy) { - OpBuilder builder(castOp); - // unpack - if (inputs.size() > 1 && outputs.size() == 1) { - ArrayRef shape = outputTy.getShape(); - Value result = xegpu::createVectorWithShapeFromValues( - builder, castOp.getLoc(), inputs, shape); - castOp->replaceAllUsesWith(ValueRange(result)); - castOp->erase(); - } - - // pack - if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) { - ArrayRef tileShape = outputTy.getShape(); - SmallVector results = xegpu::extractVectorsWithShapeFromValue( - builder, castOp.getLoc(), inputs[0], tileShape); - castOp->replaceAllUsesWith(results); - castOp->erase(); + for (OpResult result : op->getOpResults()) { + std::string name = xegpu::getLayoutName(result); + if (auto layout = op->getAttrOfType(name)) { + op->removeAttr(name); + if (!isa(op)) + xegpu::setLayoutAttr(result, layout.dropInstData()); } } }); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 60c8493f552d8..023e445206440 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -115,7 +115,8 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) { if (!value) return nullptr; - if (auto tdescTy = dyn_cast(value.getType())) + if (auto tdescTy = + dyn_cast_if_present(value.getType())) return tdescTy.getLayoutAttr(); if (auto result = dyn_cast(value)) { @@ -366,7 +367,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) { Type newTy = type; if (xegpu::LayoutAttr layout = type.getLayoutAttr()) { - SmallVector subShape, distUnit; + SmallVector subShape(shape); if (layout.isWgLayout()) { // for WgToSg, the subShape is either from sgData or computed as // shape/sgLayout @@ -378,6 +379,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) { count = computeProduct(shape) / computeProduct(subShape); layout = layout.dropInstData(); } + newTy = xegpu::TensorDescType::get(ctx, subShape, elemTy, encoding, layout); } From bc69a8de7e0d436a7718fc2b30ee4bbd7861e5a4 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Fri, 16 May 2025 14:10:26 +0000 Subject: [PATCH 09/41] check in elemwise support --- .../Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp 
From 4fc75402332a5062eaa20b51f20ef54b4e5281ac Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Fri, 16 May 2025 14:10:26 +0000
Subject: [PATCH 09/41] check in elemwise support

---
 .../Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
index fba0f882ef632..078b674de8d4f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
@@ -164,6 +164,10 @@ XeGPUInstructionlizePass::getTileShape(Operation *op) const {
     return SmallVector<int64_t>({(*aTile)[0], (*aTile)[1], (*bTile)[1]});
   }

+  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)
+    return getTileShape(op->getOpResult(0));
+
   return std::nullopt;
 }

@@ -230,7 +234,14 @@ void XeGPUInstructionlizePass::runOnOperation() {
   });

   RewritePatternSet patterns(ctx);
+
+  vector::UnrollVectorOptions vectorOptions;
+  // vectorOptions.setNativeShapeFn([&](Operation *op) { return getTileShape(op); });
+  vectorOptions.setNativeShapeFn(options.nativeShape);
+
   populateXeGPUUnrollPatterns(patterns, options);
+  vector::populateVectorUnrollPatterns(patterns, vectorOptions);
+
   (void)applyPatternsGreedily(mod, std::move(patterns));

From 4fc75402332a5062eaa20b51f20ef54b4e5281ac Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Fri, 16 May 2025 14:43:59 +0000
Subject: [PATCH 10/41] check in unit test

---
 .../Dialect/XeGPU/xegpu-instructionlize.mlir  | 123 ++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 mlir/test/Dialect/XeGPU/xegpu-instructionlize.mlir

diff --git a/mlir/test/Dialect/XeGPU/xegpu-instructionlize.mlir b/mlir/test/Dialect/XeGPU/xegpu-instructionlize.mlir
new file mode 100644
index 0000000000000..888684789cc8c
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/xegpu-instructionlize.mlir
@@ -0,0 +1,123 @@
+// RUN: mlir-opt --xegpu-instructionlize -split-input-file %s | FileCheck %s
+
+
+#a = #xegpu.layout
+#b = #xegpu.layout
+#c = #xegpu.layout
+
+#l1 = #xegpu.layout
+#l2 = #xegpu.layout
+
+gpu.module @test_kernel {
+  gpu.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+    %c0 = arith.constant 0 : index
+    %c16 = arith.constant 16 : index
+    %c32 = arith.constant 32 : index
+    %c1024 = arith.constant 1024 : index
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+    %m = arith.muli %block_id_x, %c16 : index
+    %n = arith.muli %block_id_y, %c32 : index
+
+    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
+    %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
+
+    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
+    %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
+    %out:3 = scf.for %k = %c0 to %c1024 step %c32
+      iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
+      -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
+      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
+      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
+      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+      //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+      %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout>
+      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
+      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
+      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b>
+      scf.yield %a_next_tdesc, %b_next_tdesc, %c
+        : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
+    }
+    //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout>
+    xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
+    gpu.return
+  }
+
+  //-----
+  gpu.func @test_gemm_simple(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+    %c0 = arith.constant 0 : index
+    %c16 = arith.constant 16 : index
+    %c32 = arith.constant 32 : index
+    %c1024 = arith.constant 1024 : index
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+    %m = arith.muli %block_id_x, %c16 : index
+    %n = arith.muli %block_id_y, %c32 : index
+
+    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #l1>
+    %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32>
+
+    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1>
+    %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2>
+    %out:3 = scf.for %k = %c0 to %c1024 step %c32
+      iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
+      -> (!xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>) {
+      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
+      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
+      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
+      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
+      //CHECK-COUNT-8: xegpu.dpas {{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+      %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
+      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l1>
+      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16>
+      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #l2>
+      scf.yield %a_next_tdesc, %b_next_tdesc, %c
+        : !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>
+    }
+    //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+    xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
+    gpu.return
+  }
+
+  //-----
+
+  gpu.func @test_gemm_a_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+    %c0 = arith.constant 0 : index
+    %c16 = arith.constant 16 : index
+    %c32 = arith.constant 32 : index
+    %c1024 = arith.constant 1024 : index
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+    %m = arith.muli %block_id_x, %c16 : index
+    %n = arith.muli %block_id_y, %c32 : index
+
+    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
+    %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
+
+    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
+    %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
+    %out:3 = scf.for %k = %c0 to %c1024 step %c32
+      iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
+      -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
+      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
+      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+      //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
+      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+      //CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16>
+      %e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16>
+      //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+      %c = xegpu.dpas %e, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout>
+      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
+      //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
+      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b>
+      scf.yield %a_next_tdesc, %b_next_tdesc, %c
+        : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
+    }
+    //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout>
+    xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
+    gpu.return
+  }}
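The CHECK-COUNT values in the test above follow directly from the blocking arithmetic. A quick
sanity check, assuming the (elided) layout parameters block A and C to 8x16 tiles and B to 16x16
tiles; the values are only for illustration:

    #include <cassert>

    static int numTiles(int rows, int cols, int tileRows, int tileCols) {
      return (rows / tileRows) * (cols / tileCols);
    }

    int main() {
      assert(numTiles(16, 32, 8, 16) == 4);  // A/C loads and stores: CHECK-COUNT-4
      assert(numTiles(32, 32, 16, 16) == 4); // B loads: CHECK-COUNT-4
      // dpas: 2 M-tiles x 2 N-tiles x 2 K-tiles per iteration = 8 small ops.
      assert(2 * 2 * 2 == 8);                // CHECK-COUNT-8
      return 0;
    }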
From 132f15e7400b92b61801ca0bf013be66a95c54d1 Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Fri, 16 May 2025 15:06:25 +0000
Subject: [PATCH 11/41] fix format

---
 .../XeGPU/Transforms/XeGPUInstructionlize.cpp |  1 -
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   | 15 +++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
index 078b674de8d4f..f0ebe2321f8f1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
@@ -236,7 +236,6 @@ void XeGPUInstructionlizePass::runOnOperation() {
   RewritePatternSet patterns(ctx);

   vector::UnrollVectorOptions vectorOptions;
-  // vectorOptions.setNativeShapeFn([&](Operation *op) { return getTileShape(op); });
   vectorOptions.setNativeShapeFn(options.nativeShape);

   populateXeGPUUnrollPatterns(patterns, options);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 023e445206440..14b2b909e143a 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -308,8 +308,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
   { // perform the conversion from RankedTensorType to VectorType based on the
     // LayoutAttr
     auto computeTileShapeAndCount = [&](ArrayRef<int64_t> shape,
-                                       DenseI32ArrayAttr sgDataAttr,
-                                       DenseI32ArrayAttr sgLayoutAttr) {
+                                        DenseI32ArrayAttr sgDataAttr,
+                                        DenseI32ArrayAttr sgLayoutAttr) {
       SmallVector<int64_t> tileShape;
       auto sgLayout = llvm::to_vector_of<int64_t>(sgLayoutAttr.asArrayRef());
       if (sgDataAttr)
        tileShape = llvm::to_vector_of<int64_t>(sgDataAttr.asArrayRef());
       else
        tileShape = computeShapeRatio(shape, sgLayout).value_or(tileShape);
       assert(tileShape.size() && "failed to compute tileShape");
-      SmallVector<int64_t> distUnit = computeElementwiseMul(sgLayout, tileShape);
+      SmallVector<int64_t> distUnit =
+          computeElementwiseMul(sgLayout, tileShape);
       int count = computeProduct(shape) / computeProduct(distUnit);
       return std::make_pair(tileShape, count);
     };
@@ -341,7 +342,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
           if (layout.isWgLayout()) {
             // for WgToSg, the subShape is either from sgData or computed as
             // shape/sgLayout
-            std::tie(subShape, count) = computeTileShapeAndCount(shape, layout.getSgData(), layout.getSgLayout());
+            std::tie(subShape, count) = computeTileShapeAndCount(
+                shape, layout.getSgData(), layout.getSgLayout());
           } else if (DenseI32ArrayAttr instData = layout.getInstData()) {
             // for unrolling, the subShape is determined by inst_data
             subShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
@@ -371,7 +373,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
             if (layout.isWgLayout()) {
               // for WgToSg, the subShape is either from sgData or computed as
               // shape/sgLayout
-              std::tie(subShape, count) = computeTileShapeAndCount(shape, layout.getSgData(), layout.getSgLayout());
+              std::tie(subShape, count) = computeTileShapeAndCount(
+                  shape, layout.getSgData(), layout.getSgLayout());
               layout = layout.dropSgLayoutAndData();
             } else if (DenseI32ArrayAttr instData = layout.getInstData()) {
               // for unrolling, the subShape is determined by inst_data
@@ -390,7 +393,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
     converter.addSourceMaterialization(materializeCast);
     converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
-                                          ValueRange inputs, Location loc) {
+                                           ValueRange inputs, Location loc) {
       return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
           .getResults();
     });

From aa4ba9c32d9ca14daec16bc98b27e4bb9d1f5282 Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Fri, 16 May 2025 15:21:18 +0000
Subject: [PATCH 12/41] roll back pass name

---
 .../mlir/Dialect/XeGPU/Transforms/Passes.td          |  2 +-
 .../Dialect/XeGPU/Transforms/CMakeLists.txt          |  2 +-
 ...UInstructionlize.cpp => XeGPUBlocking.cpp}        | 22 +++++++++----------
 ...structionlize.mlir => xegpu-blocking.mlir}        |  2 +-
 4 files changed, 14 insertions(+), 14 deletions(-)
 rename mlir/lib/Dialect/XeGPU/Transforms/{XeGPUInstructionlize.cpp => XeGPUBlocking.cpp} (92%)
 rename mlir/test/Dialect/XeGPU/{xegpu-instructionlize.mlir => xegpu-blocking.mlir} (99%)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 54782933fe5f8..b3883605b74f2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -38,7 +38,7 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
   ];
 }

-def XeGPUInstructionlize: Pass<"xegpu-instructionlize"> {
+def XeGPUBlocking: Pass<"xegpu-blocking"> {
   let summary = "Instructionlize XeGPU ops";
   let description = [{
     The pass unrolls XeGPU ops working on large shapes into ops working on small shapes
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 1d94b4c4c03ac..adbbdaac8fc06 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_mlir_dialect_library(MLIRXeGPUTransforms
+  XeGPUBlocking.cpp
   XeGPUFoldAliasOps.cpp
-  XeGPUInstructionlize.cpp
   XeGPUSubgroupDistribute.cpp
   XeGPUUnroll.cpp

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
similarity index 92%
rename from mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
rename to mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index f0ebe2321f8f1..1587cbdfed2cc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -1,4 +1,4 @@
-//===---- XeGPUInstructionlize.cpp -- XeGPU Instructionlize Pass ----------===//
+//===---- XeGPUBlocking.cpp ---- XeGPU Instructionlize Pass ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -20,12 +20,12 @@
 namespace mlir {
 namespace xegpu {
-#define GEN_PASS_DEF_XEGPUINSTRUCTIONLIZE
+#define GEN_PASS_DEF_XEGPUBLOCKING
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
 } // namespace xegpu
 } // namespace mlir

-#define DEBUG_TYPE "xegpu-instructionlize"
+#define DEBUG_TYPE "xegpu-blocking"
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

 using namespace mlir;
@@ -66,8 +66,8 @@ void resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
 }

 /// Unroll XeGPU ops to their instruction-level representation.
-class XeGPUInstructionlizePass final
-    : public xegpu::impl::XeGPUInstructionlizeBase<XeGPUInstructionlizePass> {
+class XeGPUBlockingPass final
+    : public xegpu::impl::XeGPUBlockingBase<XeGPUBlockingPass> {
 public:
   void runOnOperation() override;

@@ -94,7 +94,7 @@ class XeGPUInstructionlizePass final
 } // namespace

 std::optional<SmallVector<int64_t>>
-XeGPUInstructionlizePass::getTileShape(TypedValue<ShapedType> value) const {
+XeGPUBlockingPass::getTileShape(TypedValue<ShapedType> value) const {
   assert(value && "value must be non-null");
   xegpu::LayoutAttr layout = xegpu::getLayoutAttr(value);
   if (layout && layout.isSgLayout()) {
@@ -106,7 +106,7 @@ XeGPUInstructionlizePass::getTileShape(TypedValue<ShapedType> value) const {
 }

 std::optional<SmallVector<int64_t>>
-XeGPUInstructionlizePass::getTileShape(OpOperand &operand) const {
+XeGPUBlockingPass::getTileShape(OpOperand &operand) const {
   xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operand);
   if (layout && layout.isSgLayout()) {
     if (auto inst_data = layout.getInstData())
@@ -119,7 +119,7 @@ XeGPUInstructionlizePass::getTileShape(OpOperand &operand) const {
 }

 std::optional<SmallVector<int64_t>>
-XeGPUInstructionlizePass::getTileShape(OpResult result) const {
+XeGPUBlockingPass::getTileShape(OpResult result) const {
   xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result);
   if (layout && layout.isSgLayout()) {
     if (auto inst_data = layout.getInstData())
@@ -132,7 +132,7 @@ XeGPUInstructionlizePass::getTileShape(OpResult result) const {
 }

 std::optional<SmallVector<int64_t>>
-XeGPUInstructionlizePass::getTileShape(Operation *op) const {
+XeGPUBlockingPass::getTileShape(Operation *op) const {
   if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp>(op))
     return getTileShape(op->getOpResult(0));
   if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp>(op))
@@ -171,7 +171,7 @@ XeGPUInstructionlizePass::getTileShape(Operation *op) const {
   return std::nullopt;
 }

-bool XeGPUInstructionlizePass::needsUnroll(Operation *op) const {
+bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
   if (isa<LoopLikeOpInterface>(op))
     return false;

@@ -197,7 +197,7 @@ bool XeGPUInstructionlizePass::needsUnroll(Operation *op) const {
   return false;
 }

-void XeGPUInstructionlizePass::runOnOperation() {
+void XeGPUBlockingPass::runOnOperation() {
   MLIRContext *ctx = &getContext();
   Operation *mod = getOperation();

diff --git a/mlir/test/Dialect/XeGPU/xegpu-instructionlize.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
similarity index 99%
rename from mlir/test/Dialect/XeGPU/xegpu-instructionlize.mlir
rename to mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 888684789cc8c..c3db6b2abb7bd 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-instructionlize.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --xegpu-instructionlize -split-input-file %s | FileCheck %s
+// RUN: mlir-opt --xegpu-blocking -split-input-file %s | FileCheck %s

From 061b6e00f3f0036a15790fea4e3ffd9b1def5bf4 Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Fri, 16 May 2025 16:37:25 +0000
Subject: [PATCH 13/41] add 1d and 2d elemwise test

---
 mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 104 +++++++++++++++++---
 1 file changed, 93 insertions(+), 11 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index c3db6b2abb7bd..d8a5dfe7d4b13 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -1,13 +1,8 @@
 // RUN: mlir-opt --xegpu-blocking -split-input-file %s | FileCheck %s

-
 #a = #xegpu.layout
 #b = #xegpu.layout
 #c = #xegpu.layout
-
-#l1 = #xegpu.layout
-#l2 = #xegpu.layout
-
 gpu.module @test_kernel {
   gpu.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
     %c0 = arith.constant 0 : index
@@ -44,9 +39,13 @@ gpu.module @test_kernel {
     xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
     gpu.return
   }
+}

-  //-----
-  gpu.func @test_gemm_simple(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+// -----
+#l1 = #xegpu.layout
+#l2 = #xegpu.layout
+gpu.module @test_kernel {
+  gpu.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
     %c0 = arith.constant 0 : index
     %c16 = arith.constant 16 : index
     %c32 = arith.constant 32 : index
@@ -81,10 +80,14 @@ gpu.module @test_kernel {
     xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
     gpu.return
   }
+}

-  //-----
-
-  gpu.func @test_gemm_a_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+// -----
+#a = #xegpu.layout
+#b = #xegpu.layout
+#c = #xegpu.layout
+gpu.module @test_kernel {
+  gpu.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
     %c0 = arith.constant 0 : index
     %c16 = arith.constant 16 : index
     %c32 = arith.constant 32 : index
@@ -120,4 +123,83 @@ gpu.module @test_kernel {
     //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout>
     xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
     gpu.return
-  }}
+  }
+}
+
+// -----
+#l = #xegpu.layout
+gpu.module @test_kernel {
+  gpu.func @test_elementwise(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+    %c0 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %c1024 = arith.constant 1024 : index
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+    %m = arith.muli %block_id_x, %c32 : index
+
+    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l>
+    %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l>
+    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l>
+
+    %out:3 = scf.for %k = %c0 to %c1024 step %c32
+      iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
+      -> (!xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>) {
+      //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
+      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
+
+      //CHECK-COUNT-4: arith.addf {{.*}} : vector<8x16xf16>
+      %c = arith.addf %a, %b {layout_result_0 = #l} : vector<16x32xf16>
+
+      //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+      xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l>
+
+      //CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
+      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l>
+      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l>
+      %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l>
+      scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc
+        : !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>
+    }
+    gpu.return
+  }
+}
+
+// -----
+#l = #xegpu.layout
+gpu.module @test_kernel {
+  gpu.func @test_elementwise(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+    %c0 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %c1024 = arith.constant 1024 : index
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+    %m = arith.muli %block_id_x, %c32 : index
+
+    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l>
+    %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l>
+    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l>
+
+    %out:3 = scf.for %k = %c0 to %c1024 step %c32
+      iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
+      -> (!xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>) {
+      //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8xf16> -> vector<8xf16>
+      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
+      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
+
+      //CHECK-COUNT-4: arith.addf {{.*}} : vector<8xf16>
+      %c = arith.addf %a, %b {layout_result_0 = #l} : vector<32xf16>
+
+      //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8xf16>, !xegpu.tensor_desc<8xf16>
+      xegpu.store_nd %c, %arg2: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l>
+
+      //CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8xf16>
+      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c32] : !xegpu.tensor_desc<32xf16, #l>
+      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32] : !xegpu.tensor_desc<32xf16, #l>
+      %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c32] : !xegpu.tensor_desc<32xf16, #l>
+      scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc
+        : !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>
+    }
+    gpu.return
+  }
+}
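The elementwise tests above exercise the vector-unroll plumbing added in PATCH 09. A minimal
standalone sketch of that plumbing is shown below; the wrapper function and the hard-coded shape
are assumptions standing in for what getTileShape would return from the inst_data layout.

    #include <optional>
    #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
    #include "mlir/IR/OpDefinition.h"
    #include "mlir/IR/PatternMatch.h"

    using namespace mlir;

    // Hypothetical wrapper: split any single-result elementwise-mappable op
    // down to an assumed native tile shape (here 8x16).
    static void addElementwiseUnroll(RewritePatternSet &patterns) {
      vector::UnrollVectorOptions opts;
      opts.setNativeShapeFn(
          [](Operation *op) -> std::optional<SmallVector<int64_t>> {
            if (!OpTrait::hasElementwiseMappableTraits(op) ||
                op->getNumResults() != 1)
              return std::nullopt;
            return SmallVector<int64_t>{8, 16}; // assumed inst_data
          });
      vector::populateVectorUnrollPatterns(patterns, opts);
    }

In the pass itself the callback is not re-derived like this: options.nativeShape already wraps
getTileShape, so the same tile-shape oracle drives both the XeGPU and the vector unroll patterns.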
From 387ac9310f2ed10260f80be7c7d8c73ac529695c Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Fri, 16 May 2025 22:40:43 +0000
Subject: [PATCH 14/41] refactor

---
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h   |  11 +-
 .../XeGPU/Transforms/XeGPUBlocking.cpp      |  59 +++++++-
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 126 ++++--------------
 3 files changed, 88 insertions(+), 108 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index b41da0ea6a276..44faef00a739e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -17,6 +17,7 @@ class OpOperand;
 class OpResult;
 class OpBuilder;
 class ValueRange;
+class TypeConverter;

 namespace xegpu {
 class LayoutAttr;
@@ -96,10 +97,12 @@ Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
                                       ValueRange values,
                                       ArrayRef<int64_t> shape);

-/// Do type conversion for SCF structural ops, e.g., scf.for. Since VectorType
-/// cannot carry the layout attribute, they are converted into RankedTensorType
-/// first, which will convert back to VectorType in the second round.
-void doSCFStructuralTypeConversionWithTensorType(Operation *op);
+/// Do type conversion for SCF structural ops, e.g., scf.for, using SCF structural type
+/// conversion patterns. Since VectorType cannot carry the layout attribute, which is
+/// needed to guide the type conversion for XeGPU, they are first converted into
+/// RankedTensorType, where the layout attribute can be attached. And then upstream
+/// SCF structural type conversion patterns are applied with the provided converter.
+void doSCFStructuralTypeConversionWithTensorType(Operation *op, TypeConverter converter);

 } // namespace xegpu

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 1587cbdfed2cc..d0adb860abca7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

@@ -207,7 +208,63 @@ void XeGPUBlockingPass::runOnOperation() {
   xegpu::setLayoutAttrs(mod, [&](Value v) { return xegpu::getLayoutAttr(v); });

   // Perform type conversion for SCF control flow ops
-  xegpu::doSCFStructuralTypeConversionWithTensorType(mod);
+  TypeConverter converter;
+  converter.addConversion([&](Type type) -> Type { return type; });
+  converter.addConversion(
+      [&](RankedTensorType type,
+          SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+        Type elemTy = type.getElementType();
+        ArrayRef<int64_t> shape = type.getShape();
+
+        // init count and subShape to the default value. If the LayoutAttr
+        // is not present, it will return a VectorType with original shape.
+        int count = 1;
+        SmallVector<int64_t> subShape(shape);
+        if (auto layout = llvm::dyn_cast_if_present<xegpu::LayoutAttr>(type.getEncoding())) {
+          if (layout.isWgLayout())
+            return failure();
+          if (DenseI32ArrayAttr instData = layout.getInstData()) {
+            // for unrolling, the subShape is determined by inst_data
+            subShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
+            count = computeProduct(shape) / computeProduct(subShape);
+          }
+        }
+        auto newTy = VectorType::get(subShape, elemTy);
+        result.append(count, newTy);
+        return success();
+      });
+
+  converter.addConversion(
+      [&](xegpu::TensorDescType type,
+          SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+        MLIRContext *ctx = type.getContext();
+        Type elemTy = type.getElementType();
+        Attribute encoding = type.getEncoding();
+        ArrayRef<int64_t> shape = type.getShape();
+
+        // init count and newTy to the default value. If the layout attribute
+        // is not present, it will return the original type.
+        int count = 1;
+        SmallVector<int64_t> subShape(shape);
+
+        xegpu::LayoutAttr layout = type.getLayoutAttr();
+
+        if (layout) {
+          if (layout.isWgLayout())
+            return failure();
+
+          if (DenseI32ArrayAttr instData = layout.getInstData()) {
+            // for unrolling, the subShape is determined by inst_data
+            subShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
+            count = computeProduct(shape) / computeProduct(subShape);
+            layout = layout.dropInstData();
+          }
+        }
+        auto newTy = xegpu::TensorDescType::get(ctx, subShape, elemTy, encoding, layout);
+        result.append(count, newTy);
+        return success();
+      });
+
+  xegpu::doSCFStructuralTypeConversionWithTensorType(mod, converter);

   xegpu::UnrollOptions options;
   options.setFilterConstraint([&](Operation *op) -> LogicalResult {
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 14b2b909e143a..ed7d2eeb6807b 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -225,7 +225,7 @@ Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
   return result;
 }

-void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
+void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op, TypeConverter converter) {
   MLIRContext *context = op->getContext();

   auto materializeCast = [&](OpBuilder &builder, Type type, ValueRange inputs,
@@ -307,109 +307,11 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {

   { // perform the conversion from RankedTensorType to VectorType based on the
     // LayoutAttr
-    auto computeTileShapeAndCount = [&](ArrayRef<int64_t> shape,
-                                        DenseI32ArrayAttr sgDataAttr,
-                                        DenseI32ArrayAttr sgLayoutAttr) {
-      SmallVector<int64_t> tileShape;
-      auto sgLayout = llvm::to_vector_of<int64_t>(sgLayoutAttr.asArrayRef());
-      if (sgDataAttr)
-        tileShape = llvm::to_vector_of<int64_t>(sgDataAttr.asArrayRef());
-      else
-        tileShape = computeShapeRatio(shape, sgLayout).value_or(tileShape);
-      assert(tileShape.size() && "failed to compute tileShape");
-      SmallVector<int64_t> distUnit =
-          computeElementwiseMul(sgLayout, tileShape);
-      int count = computeProduct(shape) / computeProduct(distUnit);
-      return std::make_pair(tileShape, count);
-    };
-
-    TypeConverter converter;
-    converter.addConversion([&](Type type) -> Type { return type; });
-    converter.addConversion(
-        [&](RankedTensorType type,
-            SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
-          ArrayRef<int64_t> shape = type.getShape();
-          auto encoding = type.getEncoding();
-          Type elemTy = type.getElementType();
-
-          // init count and subShape to the default value. If the LayoutAttr
-          // is not present, it will return a VectorType with original shape.
-          int count = 1;
-          SmallVector<int64_t> subShape(shape);
-
-          if (auto layout =
-                  llvm::dyn_cast_if_present<xegpu::LayoutAttr>(encoding)) {
-            if (layout.isWgLayout()) {
-              // for WgToSg, the subShape is either from sgData or computed as
-              // shape/sgLayout
-              std::tie(subShape, count) = computeTileShapeAndCount(
-                  shape, layout.getSgData(), layout.getSgLayout());
-            } else if (DenseI32ArrayAttr instData = layout.getInstData()) {
-              // for unrolling, the subShape is determined by inst_data
-              subShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
-              count = computeProduct(shape) / computeProduct(subShape);
-            }
-          }
-          auto newTy = VectorType::get(subShape, elemTy);
-          result.append(count, newTy);
-          return success();
-        });
-
-    converter.addConversion(
-        [&](xegpu::TensorDescType type,
-            SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
-          MLIRContext *ctx = type.getContext();
-          Type elemTy = type.getElementType();
-          Attribute encoding = type.getEncoding();
-          ArrayRef<int64_t> shape = type.getShape();
-
-          // init count and newTy to the default value. If the layout attribute
-          // is not present, it will return the original type.
-          int count = 1;
-          Type newTy = type;
-
-          if (xegpu::LayoutAttr layout = type.getLayoutAttr()) {
-            SmallVector<int64_t> subShape(shape);
-            if (layout.isWgLayout()) {
-              // for WgToSg, the subShape is either from sgData or computed as
-              // shape/sgLayout
-              std::tie(subShape, count) = computeTileShapeAndCount(
-                  shape, layout.getSgData(), layout.getSgLayout());
-              layout = layout.dropSgLayoutAndData();
-            } else if (DenseI32ArrayAttr instData = layout.getInstData()) {
-              // for unrolling, the subShape is determined by inst_data
-              subShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
-              count = computeProduct(shape) / computeProduct(subShape);
-              layout = layout.dropInstData();
-            }
-
-            newTy = xegpu::TensorDescType::get(ctx, subShape, elemTy, encoding,
-                                               layout);
-          }
-
-          result.append(count, newTy);
-          return success();
-        });
-
-    converter.addSourceMaterialization(materializeCast);
-    converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
-                                           ValueRange inputs, Location loc) {
-      return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
-          .getResults();
-    });
-
-    mlir::ConversionTarget target(*context);
-    target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
-        [&](UnrealizedConversionCastOp op) {
-          auto isTensorTy = [&](Type type) {
-            return isa<RankedTensorType>(type);
-          };
-          if (llvm::any_of(op->getOperandTypes(), isTensorTy) ||
-              llvm::any_of(op->getResultTypes(), isTensorTy))
-            return false;
-          return true;
-        });

+    // Handle the UnrealizedConversionCastOp introduced by the first step.
+    // For vector->RankedTensorType, it will simply forward the inputs.
+    // For RankedTensorType->vector, it will update the inputs with the
+    // one from the adaptor.
     class UnrealizedConversionCastOpPattern
         : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
       using OpConversionPattern<
@@ -444,6 +346,24 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
       }
     };

+    converter.addSourceMaterialization(materializeCast);
+    converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
+                                           ValueRange inputs, Location loc) {
+      return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
+          .getResults();
+    });
+
+    mlir::ConversionTarget target(*context);
+    target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
+        [&](UnrealizedConversionCastOp op) {
+          auto isTensorTy = [&](Type type) {
+            return isa<RankedTensorType>(type);
+          };
+          if (llvm::any_of(op->getOperandTypes(), isTensorTy) ||
+              llvm::any_of(op->getResultTypes(), isTensorTy))
+            return false;
+          return true;
+        });
     mlir::RewritePatternSet patterns(context);
     patterns.insert<UnrealizedConversionCastOpPattern>(context);
     scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,

From ebd78aedf4859179b417056a0c7f9bfcf5ab2968 Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Fri, 16 May 2025 23:27:56 +0000
Subject: [PATCH 15/41] fix naming issue

---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index d0adb860abca7..4b6a03c8716c0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -1,4 +1,4 @@
-//===---- XeGPUBlocking.cpp ---- XeGPU Instructionlize Pass ---------------===//
+//===---- XeGPUBlocking.cpp ---- XeGPU Blocking Pass ----------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -242,8 +242,8 @@ void XeGPUBlockingPass::runOnOperation() {
         Attribute encoding = type.getEncoding();
         ArrayRef<int64_t> shape = type.getShape();

-        // init count and newTy to the default value. If the layout attribute
-        // is not present, it will return the original type.
+        // init count and newTy to the default value. If the layout
+        // attribute is not present, it will return the original type.
         int count = 1;

From bbf4796df3f0e80dbaeeac380ab998bbb5cdf76e Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Fri, 16 May 2025 23:28:33 +0000
Subject: [PATCH 16/41] fix format

---
 mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h   | 14 ++++++++------
 .../lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp   |  6 ++++--
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp          |  3 ++-
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 44faef00a739e..b8e5fe5cbde32 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -97,12 +97,14 @@ Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
                                       ValueRange values,
                                       ArrayRef<int64_t> shape);

-/// Do type conversion for SCF structural ops, e.g., scf.for, using SCF structural type
-/// conversion patterns. Since VectorType cannot carry the layout attribute, which is
-/// needed to guide the type conversion for XeGPU, they are first converted into
-/// RankedTensorType, where the layout attribute can be attached. And then upstream
-/// SCF structural type conversion patterns are applied with the provided converter.
-void doSCFStructuralTypeConversionWithTensorType(Operation *op, TypeConverter converter);
+/// Do type conversion for SCF structural ops, e.g., scf.for, using SCF
+/// structural type conversion patterns. Since VectorType cannot carry the
+/// layout attribute, which is needed to guide the type conversion for XeGPU,
+/// they are first converted into RankedTensorType, where the layout attribute
+/// can be attached. And then upstream SCF structural type conversion patterns
+/// are applied with the provided converter.
+void doSCFStructuralTypeConversionWithTensorType(Operation *op,
                                                  TypeConverter converter);

 } // namespace xegpu

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 4b6a03c8716c0..19ff4bf992b07 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -220,7 +220,8 @@ void XeGPUBlockingPass::runOnOperation() {
         // is not present, it will return a VectorType with original shape.
         int count = 1;
         SmallVector<int64_t> subShape(shape);
-        if (auto layout = llvm::dyn_cast_if_present<xegpu::LayoutAttr>(type.getEncoding())) {
+        if (auto layout = llvm::dyn_cast_if_present<xegpu::LayoutAttr>(
+                type.getEncoding())) {
           if (layout.isWgLayout())
             return failure();
           if (DenseI32ArrayAttr instData = layout.getInstData()) {
@@ -260,7 +261,8 @@ void XeGPUBlockingPass::runOnOperation() {
             layout = layout.dropInstData();
           }
         }
-        auto newTy = xegpu::TensorDescType::get(ctx, subShape, elemTy, encoding, layout);
+        auto newTy =
+            xegpu::TensorDescType::get(ctx, subShape, elemTy, encoding, layout);
         result.append(count, newTy);
         return success();
       });
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index ed7d2eeb6807b..5e0e83ef2eed5 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -225,7 +225,8 @@ Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
   return result;
 }

-void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op, TypeConverter converter) {
+void xegpu::doSCFStructuralTypeConversionWithTensorType(
+    Operation *op, TypeConverter converter) {
   MLIRContext *context = op->getContext();

   auto materializeCast = [&](OpBuilder &builder, Type type, ValueRange inputs,

From 3807eeaf672c17b77b2b2fe8733709aab3f52842 Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Mon, 19 May 2025 16:06:03 +0000
Subject: [PATCH 17/41] fix overflow

---
 mlir/lib/Dialect/Utils/IndexingUtils.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Utils/IndexingUtils.cpp b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
index d9edabef6693d..8de77e2c3cb08 100644
--- a/mlir/lib/Dialect/Utils/IndexingUtils.cpp
+++ b/mlir/lib/Dialect/Utils/IndexingUtils.cpp
@@ -24,7 +24,7 @@ SmallVector<ExprType> computeSuffixProductImpl(ArrayRef<ExprType> sizes,
   if (sizes.empty())
     return {};
   SmallVector<ExprType> strides(sizes.size(), unit);
-  for (int64_t r = strides.size() - 2; r >= 0; --r)
+  for (int64_t r = static_cast<int64_t>(strides.size()) - 2; r >= 0; --r)
     strides[r] = strides[r + 1] * sizes[r + 1];
   return strides;
 }
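The one-character fix above matters because strides.size() is an unsigned size_t: for a
one-element `sizes`, the subtraction wraps around to a huge unsigned value before any conversion
to int64_t happens, so the loop bound silently depends on signed-conversion behavior rather than
stating the intent. A minimal reproduction:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    int main() {
      std::size_t n = 1;                                 // strides.size() == 1
      std::cout << n - 2 << "\n";                        // 18446744073709551615
      std::cout << static_cast<int64_t>(n) - 2 << "\n";  // -1: loop body skipped
      return 0;
    }

Casting to int64_t before subtracting keeps the arithmetic signed throughout, so the degenerate
single-element case falls straight through the loop.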
From c6695d99ab557c97269406ffe0a77d0feeb99b2b Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Mon, 19 May 2025 21:15:56 +0000
Subject: [PATCH 18/41] add comments

---
 mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td | 2 +-
 mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h   | 2 ++
 mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp  | 7 ++++++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index b3883605b74f2..7baa880c6ff08 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -39,7 +39,7 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
 }

 def XeGPUBlocking: Pass<"xegpu-blocking"> {
-  let summary = "Instructionlize XeGPU ops";
+  let summary = "Block XeGPU ops into smaller size.";
   let description = [{
     The pass unrolls XeGPU ops working on large shapes into ops working on small shapes
     (given by the inst_data in the layout attr), such that each of them can be dispatch
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index b8e5fe5cbde32..4077de593b109 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -103,6 +103,8 @@ Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
 /// first converted into RankedTensorType, where the layout attribute can be
 /// attached. And then upstream SCF structural type conversion patterns are
 /// applied with the provided converter.
+/// TODO: This is a temporary solution. We should refactor it when context-aware
+/// type conversion is available.
 void doSCFStructuralTypeConversionWithTensorType(Operation *op,
                                                  TypeConverter converter);

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 19ff4bf992b07..778ab0476b312 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -33,7 +33,12 @@ using namespace mlir;

 namespace {

-void resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
+// Resolve the unrealized conversion cast ops generated when doing SCF
+// Structural Type Conversion. They take two forms: N:1 vector casts and
+// 1:N vector casts. vector::insert_strided_slice ops are used for the
+// first case, and vector::extract_strided_slice ops are used for the
+// second case.
+static void resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
   ValueRange inputs = castOp.getInputs();
   ValueRange outputs = castOp.getOutputs();

From 50e33ff069acc9e706f51ed814e1bc9961161f75 Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Tue, 20 May 2025 14:19:55 +0000
Subject: [PATCH 19/41] add dbg log

---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 778ab0476b312..6ac66ce7e6988 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -28,6 +28,7 @@ namespace xegpu {

 #define DEBUG_TYPE "xegpu-blocking"
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

 using namespace mlir;

@@ -121,6 +122,7 @@ XeGPUBlockingPass::getTileShape(OpOperand &operand) const {
     if (auto type = dyn_cast<ShapedType>(operand.get().getType()))
       return llvm::to_vector(type.getShape());
   }
+  LDBG("failed to getTileShape for operand: " << operand.get());
   return std::nullopt;
 }

@@ -134,6 +136,7 @@ XeGPUBlockingPass::getTileShape(OpResult result) const {
     if (auto type = dyn_cast<ShapedType>(result.getType()))
       return llvm::to_vector(type.getShape());
   }
+  LDBG("failed to getTileShape for result: " << result);
   return std::nullopt;
 }

From ae22f2796b3da2267c1be06a9fdffc7466c92027 Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Tue, 20 May 2025 14:20:29 +0000
Subject: [PATCH 20/41] fix format

---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 6ac66ce7e6988..5bde40449b926 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -39,7 +39,8 @@ namespace {
 // cast and 1:N vector cast. vector::insert_strided_slice ops will be
 // used for the first case, and vector::extract_strided_slice ops will be
 // used for the second case.
-static void resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
+static void
+resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
   ValueRange inputs = castOp.getInputs();

From 977685060a9b2ca8df3b648c49ce946609e571d8 Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Tue, 20 May 2025 14:29:13 +0000
Subject: [PATCH 21/41] cleanup

---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 5bde40449b926..b4ff5856b0b6c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -188,20 +188,20 @@ bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
   for (auto &opr : op->getOpOperands()) {
     std::optional<SmallVector<int64_t>> tileShape = getTileShape(opr);
     auto shapedType = dyn_cast<ShapedType>(opr.get().getType());
-    if (!shapedType)
+    if (!shapedType || !tileShape)
       continue;

-    if (tileShape && !llvm::equal(*tileShape, shapedType.getShape()))
+    if (!llvm::equal(*tileShape, shapedType.getShape()))
       return true;
   }

   for (auto result : op->getOpResults()) {
     std::optional<SmallVector<int64_t>> tileShape = getTileShape(result);
     auto shapedType = dyn_cast<ShapedType>(result.getType());
-    if (!shapedType)
+    if (!shapedType || !tileShape)
       continue;

-    if (tileShape && !llvm::equal(*tileShape, shapedType.getShape()))
+    if (!llvm::equal(*tileShape, shapedType.getShape()))
       return true;
   }
   return false;

From 6cffa443d1c11197106d076e21da9fa973592fe8 Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Tue, 20 May 2025 15:42:06 +0000
Subject: [PATCH 22/41] refactor

---
 .../XeGPU/Transforms/XeGPUBlocking.cpp | 67 +++++++++----------
 1 file changed, 32 insertions(+), 35 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index b4ff5856b0b6c..9c839f0c056f8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -216,6 +216,18 @@ void XeGPUBlockingPass::runOnOperation() {
   // operation is replaced.
   xegpu::setLayoutAttrs(mod, [&](Value v) { return xegpu::getLayoutAttr(v); });

+  auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
+                                 xegpu::LayoutAttr layout) {
+    int count = 1;
+    SmallVector<int64_t> tileShape(shape);
+    if (layout && layout.getInstData()) {
+      DenseI32ArrayAttr instData = layout.getInstData();
+      tileShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
+      count = computeProduct(shape) / computeProduct(tileShape);
+    }
+    return std::make_pair(tileShape, count);
+  };
+
   // Perform type conversion for SCF control flow ops
   TypeConverter converter;
   converter.addConversion([&](Type type) -> Type { return type; });
   converter.addConversion(
       [&](RankedTensorType type,
           SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
         Type elemTy = type.getElementType();
         ArrayRef<int64_t> shape = type.getShape();

-        // init count and subShape to the default value. If the LayoutAttr
-        // is not present, it will return a VectorType with original shape.
-        int count = 1;
-        SmallVector<int64_t> subShape(shape);
-        if (auto layout = llvm::dyn_cast_if_present<xegpu::LayoutAttr>(
-                type.getEncoding())) {
-          if (layout.isWgLayout())
-            return failure();
-          if (DenseI32ArrayAttr instData = layout.getInstData()) {
-            // for unrolling, the subShape is determined by inst_data
-            subShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
-            count = computeProduct(shape) / computeProduct(subShape);
-          }
-        }
+        auto layout =
+            llvm::dyn_cast_if_present<xegpu::LayoutAttr>(type.getEncoding());
+        if (layout && layout.isWgLayout())
+          return failure();
+
+        int count;
+        SmallVector<int64_t> subShape;
+        std::tie(subShape, count) = getTileShapeAndCount(shape, layout);
         auto newTy = VectorType::get(subShape, elemTy);
         result.append(count, newTy);
         return success();
       });
-
   converter.addConversion(
       [&](xegpu::TensorDescType type,
           SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
-        MLIRContext *ctx = type.getContext();
         Type elemTy = type.getElementType();
-        Attribute encoding = type.getEncoding();
         ArrayRef<int64_t> shape = type.getShape();

-        // init count and newTy to the default value. If the layout
-        // attribute is not present, it will return the original type.
-        int count = 1;
-        SmallVector<int64_t> subShape(shape);
-
         xegpu::LayoutAttr layout = type.getLayoutAttr();
+        if (layout && layout.isWgLayout())
+          return failure();
+
+        int count;
+        SmallVector<int64_t> subShape;
+        std::tie(subShape, count) = getTileShapeAndCount(shape, layout);

-        if (layout) {
-          if (layout.isWgLayout())
-            return failure();
-
-          if (DenseI32ArrayAttr instData = layout.getInstData()) {
-            // for unrolling, the subShape is determined by inst_data
-            subShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
-            count = computeProduct(shape) / computeProduct(subShape);
-            layout = layout.dropInstData();
-          }
-        }
-        auto newTy =
-            xegpu::TensorDescType::get(ctx, subShape, elemTy, encoding, layout);
+        if (layout)
+          layout = layout.dropInstData();
+
+        auto newTy = xegpu::TensorDescType::get(
+            type.getContext(), subShape, elemTy, type.getEncoding(), layout);
         result.append(count, newTy);
         return success();
       });

   xegpu::doSCFStructuralTypeConversionWithTensorType(mod, converter);
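A worked instance of the getTileShapeAndCount arithmetic introduced above: a 16x32 value whose
layout has inst_data [8, 16] is replaced by four 8x16 values during the structural conversion.
The numbers below are an illustrative example, not taken from a specific test:

    #include <cassert>

    int main() {
      int shape[2] = {16, 32}, inst[2] = {8, 16};
      int count = (shape[0] * shape[1]) / (inst[0] * inst[1]);
      assert(count == 4); // the converter appends the 8x16 type `count` times
      return 0;
    }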
From e023c1a235a7a452570b2cdb2ccb6851df2c9b7d Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Thu, 22 May 2025 17:52:06 +0000
Subject: [PATCH 23/41] add a corner unit test

---
 .../XeGPU/Transforms/XeGPUBlocking.cpp       | 40 ++++++++++++-----
 .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 14 +++---
 mlir/test/Dialect/XeGPU/xegpu-blocking.mlir  | 43 +++++++++++++++++++
 3 files changed, 78 insertions(+), 19 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 9c839f0c056f8..f8b5d4a9caaf9 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -185,24 +185,44 @@ bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
   if (isa<LoopLikeOpInterface>(op))
     return false;

-  for (auto &opr : op->getOpOperands()) {
+  auto isUnrollable = [&](Value value,
+                          ArrayRef<int64_t> tileShape) -> std::optional<bool> {
+    Type valTy = value.getType();
+    if (auto tdesc = dyn_cast<xegpu::TensorDescType>(valTy)) {
+      xegpu::LayoutAttr layout = tdesc.getLayoutAttr();
+      if (!layout)
+        return std::nullopt;
+      if (layout.isWgLayout())
+        return false;
+      if (layout.getInstData())
+        return true;
+    }
+
+    auto shapedType = dyn_cast<ShapedType>(valTy);
+    if (shapedType && !llvm::equal(tileShape, shapedType.getShape()))
+      return true;
+
+    return std::nullopt;
+  };
+
+  for (OpOperand &opr : op->getOpOperands()) {
     std::optional<SmallVector<int64_t>> tileShape = getTileShape(opr);
-    auto shapedType = dyn_cast<ShapedType>(opr.get().getType());
-    if (!shapedType || !tileShape)
+    if (!tileShape)
       continue;

-    if (!llvm::equal(*tileShape, shapedType.getShape()))
-      return true;
+    std::optional<bool> unrollable = isUnrollable(opr.get(), *tileShape);
+    if (unrollable.has_value())
+      return unrollable.value();
   }

-  for (auto result : op->getOpResults()) {
+  for (OpResult result : op->getOpResults()) {
     std::optional<SmallVector<int64_t>> tileShape = getTileShape(result);
-    auto shapedType = dyn_cast<ShapedType>(result.getType());
-    if (!shapedType || !tileShape)
+    if (!tileShape)
       continue;

-    if (!llvm::equal(*tileShape, shapedType.getShape()))
-      return true;
+    std::optional<bool> unrollable = isUnrollable(result, *tileShape);
+    if (unrollable.has_value())
+      return unrollable.value();
   }
   return false;
 }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index d9f69158f95eb..885477fe4cbd5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -136,7 +136,7 @@ struct UnrollCreateNdOp : public UnrollPattern<xegpu::CreateNdDescOp> {
     ArrayRef<int64_t> shape = tdescTy.getShape();

     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
-    if (!targetShape || llvm::equal(*targetShape, shape))
+    if (!targetShape)
       return failure();

     auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0];
@@ -187,10 +187,9 @@ struct UnrollUpdateNdOffsetOp : public UnrollPattern<xegpu::UpdateNdOffsetOp> {
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
-    ArrayRef<int64_t> shape = tdescTy.getShape();

     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
-    if (!targetShape || llvm::equal(*targetShape, shape))
+    if (!targetShape)
       return failure();

     SmallVector<Type> convertedTdescTypes =
@@ -216,10 +215,9 @@ struct UnrollPrefetchNdOp : public UnrollPattern<xegpu::PrefetchNdOp> {
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
-    ArrayRef<int64_t> shape = tdescTy.getShape();

     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
-    if (!targetShape || llvm::equal(*targetShape, shape))
+    if (!targetShape)
       return failure();

     SmallVector<Type> convertedTdescTypes =
@@ -243,10 +241,9 @@ struct UnrollLoadNdOp : public UnrollPattern<xegpu::LoadNdOp> {
     Location loc = op.getLoc();
     VectorType valueTy = op.getType();
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
-    ArrayRef<int64_t> shape = tdescTy.getShape();

     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
-    if (!targetShape || llvm::equal(*targetShape, shape))
+    if (!targetShape)
       return failure();

     Type elemTy = tdescTy.getElementType();
@@ -278,10 +275,9 @@ struct UnrollStoreNdOp : public UnrollPattern<xegpu::StoreNdOp> {
     Location loc = op.getLoc();
     VectorType valueTy = op.getValueType();
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
-    ArrayRef<int64_t> shape = tdescTy.getShape();

     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
-    if (!targetShape || llvm::equal(*targetShape, shape))
+    if (!targetShape)
       return failure();

     SmallVector<Type> convertedValTypes =
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index d8a5dfe7d4b13..c9866b94dc79e 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -82,6 +82,49 @@ gpu.module @test_kernel {
   }
 }

+// -----
+#l1 = #xegpu.layout
+#l2 = #xegpu.layout
+gpu.module @test_kernel {
+  gpu.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    %c32 = arith.constant 32 : index
+    %c1024 = arith.constant 1024 : index
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+    %m = arith.muli %block_id_x, %c8 : index
+    %n = arith.muli %block_id_y, %c32 : index
+
+    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x32xf32, #l1>
+
+    //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+    %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32>
+
+    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #l1>
+    %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l2>
+    %out:3 = scf.for %k = %c0 to %c1024 step %c16
+      iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
+      -> (!xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>) {
+      //CHECK: %22 = xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
+      //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
+      %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
+      //CHECK: xegpu.update_nd_offset {{.*}} [%c0, %c32] : !xegpu.tensor_desc<8x16xf16>
+      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<8x16xf16, #l1>
+      //CHECK-COUNT-2: xegpu.update_nd_offset {{.*}} [%c32, %c0] : !xegpu.tensor_desc<16x16xf16>
+      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<16x32xf16, #l2>
+      scf.yield %a_next_tdesc, %b_next_tdesc, %c
+        : !xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>
+    }
+    //CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+    xegpu.store_nd %out#2, %c_tdesc: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1>
+    gpu.return
+  }
+}
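The "corner" in this test is the one-to-one lowering path: A's tensor_desc shape (8x16) already
equals its assumed inst_data, so blocking is a no-op for A (single, unprefixed CHECK lines), while
B (16x32 with an assumed inst_data of [16, 16]) still splits in two, hence CHECK-COUNT-2. The
assumed tile sizes below are illustrative, since the layout parameters are elided above:

    #include <cassert>

    int main() {
      assert((8 / 8) * (16 / 16) == 1);   // A: one 8x16 tile -> a single op
      assert((16 / 16) * (32 / 16) == 2); // B: two 16x16 tiles -> two ops
      return 0;
    }

This is exactly the case the reworked needsUnroll/isUnrollable logic has to answer "no opinion"
for, rather than forcing an unroll when the tile shape already matches the full shape.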
std::optional> getTileShape(OpOperand &operand) const; + // Get the tile shape for a given result by examining the layout attribute. + // If layout is not present or is not a subgroup level layout, it returns + // std::nullopt. std::optional> getTileShape(OpResult result) const; // Get the tile shape for a given operation. @@ -101,18 +100,6 @@ class XeGPUBlockingPass final }; } // namespace -std::optional> -XeGPUBlockingPass::getTileShape(TypedValue value) const { - assert(value && "value must be non-null"); - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(value); - if (layout && layout.isSgLayout()) { - if (auto inst_data = layout.getInstData()) - return llvm::to_vector_of(inst_data.asArrayRef()); - return llvm::to_vector(value.getType().getShape()); - } - return std::nullopt; -} - std::optional> XeGPUBlockingPass::getTileShape(OpOperand &operand) const { xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operand); diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index c9866b94dc79e..4fe3844dc1c39 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -4,7 +4,7 @@ #b = #xegpu.layout #c = #xegpu.layout gpu.module @test_kernel { - gpu.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { + gpu.func @test_gemm_with_one_to_n_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c32 = arith.constant 32 : index @@ -45,7 +45,7 @@ gpu.module @test_kernel { #l1 = #xegpu.layout #l2 = #xegpu.layout gpu.module @test_kernel { - gpu.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { + gpu.func @test_gemm_with_inst_data_only_attribute(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c32 = arith.constant 32 : index @@ -86,7 +86,7 @@ gpu.module @test_kernel { #l1 = #xegpu.layout #l2 = #xegpu.layout gpu.module @test_kernel { - gpu.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { + gpu.func @test_gemm_with_one_to_one_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { %c0 = arith.constant 0 : index %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index @@ -130,7 +130,7 @@ gpu.module @test_kernel { #b = #xegpu.layout #c = #xegpu.layout gpu.module @test_kernel { - gpu.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { + gpu.func @test_gemm_with_elemwise_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c32 = arith.constant 32 : index @@ -172,7 +172,7 @@ gpu.module @test_kernel { // ----- #l = #xegpu.layout gpu.module @test_kernel { - gpu.func @test_elementwise(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { + gpu.func @test_elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index %c1024 = arith.constant 1024 : index @@ -211,7 +211,7 @@ gpu.module @test_kernel { // ----- #l = #xegpu.layout gpu.module @test_kernel { - gpu.func @test_elementwise(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: 
memref<1024x1024xf16>) { + gpu.func @test_elementwise_1D(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index %c1024 = arith.constant 1024 : index From aebc327a494876e57219e236bd040b55b8d4bc76 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 14:41:49 +0000 Subject: [PATCH 25/41] remove unnecessary reference for lambda --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index fcf9a09a8ffc0..fefcaf7e73d41 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -172,8 +172,8 @@ bool XeGPUBlockingPass::needsUnroll(Operation *op) const { if (isa(op)) return false; - auto isUnrollable = [&](Value value, - ArrayRef tileShape) -> std::optional { + auto isUnrollable = [](Value value, + ArrayRef tileShape) -> std::optional { Type valTy = value.getType(); if (auto tdesc = dyn_cast(valTy)) { xegpu::LayoutAttr layout = tdesc.getLayoutAttr(); @@ -221,7 +221,7 @@ void XeGPUBlockingPass::runOnOperation() { // Preserve the LayoutAttr for each operand to the owner's DictionaryAttr. // This ensures that the LayoutAttr remains accessible even if the defining // operation is replaced. - xegpu::setLayoutAttrs(mod, [&](Value v) { return xegpu::getLayoutAttr(v); }); + xegpu::setLayoutAttrs(mod, [](Value v) { return xegpu::getLayoutAttr(v); }); auto getTileShapeAndCount = [](llvm::ArrayRef shape, xegpu::LayoutAttr layout) { @@ -237,7 +237,7 @@ void XeGPUBlockingPass::runOnOperation() { // Perform type conversion for SCF control flow ops TypeConverter converter; - converter.addConversion([&](Type type) -> Type { return type; }); + converter.addConversion([](Type type) -> Type { return type; }); converter.addConversion( [&](RankedTensorType type, SmallVectorImpl &result) -> std::optional { @@ -283,7 +283,7 @@ void XeGPUBlockingPass::runOnOperation() { xegpu::UnrollOptions options; options.setFilterConstraint([&](Operation *op) -> LogicalResult { - return needsUnroll(op) ? success() : failure(); + return success(needsUnroll(op)); }); options.setNativeShapeFn([&](Operation *op) { return getTileShape(op); }); @@ -315,7 +315,7 @@ void XeGPUBlockingPass::runOnOperation() { (void)applyPatternsGreedily(mod, std::move(patterns)); - mod->walk([&](Operation *op) { + mod->walk([](Operation *op) { if (auto castOp = dyn_cast(op)) resolveUnrealizedConversionCastOp(castOp); From 90e7563a2b7e09b3cc506946cc8afa960316606e Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 14:45:45 +0000 Subject: [PATCH 26/41] rename --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index fefcaf7e73d41..1473ccf6feeae 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -216,12 +216,12 @@ bool XeGPUBlockingPass::needsUnroll(Operation *op) const { void XeGPUBlockingPass::runOnOperation() { MLIRContext *ctx = &getContext(); - Operation *mod = getOperation(); + Operation *op = getOperation(); // Preserve the LayoutAttr for each operand to the owner's DictionaryAttr. 
// This ensures that the LayoutAttr remains accessible even if the defining // operation is replaced. - xegpu::setLayoutAttrs(mod, [](Value v) { return xegpu::getLayoutAttr(v); }); + xegpu::setLayoutAttrs(op, [](Value v) { return xegpu::getLayoutAttr(v); }); auto getTileShapeAndCount = [](llvm::ArrayRef shape, xegpu::LayoutAttr layout) { @@ -279,7 +279,7 @@ void XeGPUBlockingPass::runOnOperation() { return success(); }); - xegpu::doSCFStructuralTypeConversionWithTensorType(mod, converter); + xegpu::doSCFStructuralTypeConversionWithTensorType(op, converter); xegpu::UnrollOptions options; options.setFilterConstraint([&](Operation *op) -> LogicalResult { @@ -313,9 +313,9 @@ void XeGPUBlockingPass::runOnOperation() { populateXeGPUUnrollPatterns(patterns, options); vector::populateVectorUnrollPatterns(patterns, vectorOptions); - (void)applyPatternsGreedily(mod, std::move(patterns)); + (void)applyPatternsGreedily(op, std::move(patterns)); - mod->walk([](Operation *op) { + op->walk([](Operation *op) { if (auto castOp = dyn_cast(op)) resolveUnrealizedConversionCastOp(castOp); From f5bfc2f8f22e93c0168ffc4b72152bf9f88d9084 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 15:18:20 +0000 Subject: [PATCH 27/41] address comments --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 5 +---- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 6 ++---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 1473ccf6feeae..1d034e5685ed3 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -60,10 +60,7 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) { builder, castOp.getLoc(), inputs, shape); castOp->replaceAllUsesWith(ValueRange(result)); castOp->erase(); - } - - // pack - if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) { + } else if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) { ArrayRef tileShape = outputTy.getShape(); SmallVector results = xegpu::extractVectorsWithShapeFromValue( builder, castOp.getLoc(), inputs[0], tileShape); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 5e0e83ef2eed5..d8b3906468ea8 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -360,10 +360,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( auto isTensorTy = [&](Type type) { return isa(type); }; - if (llvm::any_of(op->getOperandTypes(), isTensorTy) || - llvm::any_of(op->getResultTypes(), isTensorTy)) - return false; - return true; + return llvm::none_of(op->getOperandTypes(), isTensorTy) && + llvm::none_of(op->getResultTypes(), isTensorTy); }); mlir::RewritePatternSet patterns(context); patterns.insert(context); From 598fbcede72a9269cd14e4241ab6da9eb829edbe Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 15:18:43 +0000 Subject: [PATCH 28/41] fix format --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 1d034e5685ed3..2ad757d7ed25d 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -279,9 +279,8 @@ void XeGPUBlockingPass::runOnOperation() { 
xegpu::doSCFStructuralTypeConversionWithTensorType(op, converter); xegpu::UnrollOptions options; - options.setFilterConstraint([&](Operation *op) -> LogicalResult { - return success(needsUnroll(op)); - }); + options.setFilterConstraint( + [&](Operation *op) -> LogicalResult { return success(needsUnroll(op)); }); options.setNativeShapeFn([&](Operation *op) { return getTileShape(op); }); From ff11a0572326b85208acd04809651d1631a0e74e Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 15:59:54 +0000 Subject: [PATCH 29/41] add comments --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 3f5fe2cce4636..84c1dc1373ee5 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -295,6 +295,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> { } LayoutAttr dropSgLayoutAndData() { + // avoid an attribute in which every field is nullptr, which may lead to a segmentation fault if (!getInstData() && !getLaneLayout()) return nullptr; return LayoutAttr::get(getContext(), nullptr, nullptr, getInstData(), @@ -302,6 +303,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> { } LayoutAttr dropInstData() { + // avoid an attribute in which every field is nullptr, which may lead to a segmentation fault if (!getSgLayout() && !getLaneLayout()) return nullptr; return LayoutAttr::get(getContext(), getSgLayout(), getSgData(), nullptr, From 9f7f715a19eee82028121ad1b8f234104950c5f7 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 16:31:41 +0000 Subject: [PATCH 30/41] add comments --- .../XeGPU/Transforms/XeGPUBlocking.cpp | 52 ++++++++++++------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 2ad757d7ed25d..7e627bfc81ac3 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -43,30 +43,44 @@ static void resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) { ValueRange inputs = castOp.getInputs(); ValueRange outputs = castOp.getOutputs(); - - if (inputs.size() == 1 && outputs.size() == 1) { - castOp->replaceAllUsesWith(inputs); + if (inputs.empty() || outputs.empty()) { + LDBG("erase unrealized conversion cast op that has no inputs/outputs."); castOp->erase(); + return; } VectorType inputTy = dyn_cast(inputs[0].getType()); VectorType outputTy = dyn_cast(outputs[0].getType()); - if (inputTy && outputTy) { - OpBuilder builder(castOp); - // unpack - if (inputs.size() > 1 && outputs.size() == 1) { - ArrayRef shape = outputTy.getShape(); - Value result = xegpu::createVectorWithShapeFromValues( - builder, castOp.getLoc(), inputs, shape); - castOp->replaceAllUsesWith(ValueRange(result)); - castOp->erase(); - } else if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) { - ArrayRef tileShape = outputTy.getShape(); - SmallVector results = xegpu::extractVectorsWithShapeFromValue( - builder, castOp.getLoc(), inputs[0], tileShape); - castOp->replaceAllUsesWith(results); - castOp->erase(); - } + if (!inputTy || !outputTy) { + LDBG("skip unrealized conversion cast op with non-vector inputs/outputs."); + return; + } + + // We are only interested in the case where all inputs and outputs have + // identical types + if (llvm::any_of(castOp->getOperandTypes(), + [&](Type t) { return t != inputTy; 
}) || + llvm::any_of(castOp->getResultTypes(), + [&](Type t) { return t != outputTy; })) { + LDBG("skip unrealized conversion cast op not emulating pack/unpack."); + return; + } + + OpBuilder builder(castOp); + if (inputs.size() > 1 && outputs.size() == 1) { + // the castOp is emulating an unpack op + ArrayRef shape = outputTy.getShape(); + Value result = xegpu::createVectorWithShapeFromValues( + builder, castOp.getLoc(), inputs, shape); + castOp->replaceAllUsesWith(ValueRange(result)); + castOp->erase(); + } else if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) { + // the castOp is emulating a pack op + ArrayRef tileShape = outputTy.getShape(); + SmallVector results = xegpu::extractVectorsWithShapeFromValue( + builder, castOp.getLoc(), inputs[0], tileShape); + castOp->replaceAllUsesWith(results); + castOp->erase(); } } From b164d7b4d4224c4c53d6e9fa34bb238251172dbc Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 16:57:59 +0000 Subject: [PATCH 31/41] address comments --- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index d8b3906468ea8..7cede355b7561 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -165,17 +165,17 @@ void xegpu::setLayoutAttr(OpResult result, LayoutAttr layout) { owner->setAttr(name, layout); } -void xegpu::setLayoutAttrs(Operation *mod, +void xegpu::setLayoutAttrs(Operation *op, function_ref getLayoutImpl) { - mod->walk([&](Operation *op) { - for (OpResult result : op->getOpResults()) { - auto layout = getLayoutImpl(result); - setLayoutAttr(result, layout); - } - for (OpOperand &opr : op->getOpOperands()) { + op->walk([&](Operation *nestOp) { + for (OpOperand &opr : nestOp->getOpOperands()) { auto layout = getLayoutImpl(opr.get()); setLayoutAttr(opr, layout); } + for (OpResult result : nestOp->getOpResults()) { + auto layout = getLayoutImpl(result); + setLayoutAttr(result, layout); + } }); } From 554f4b414b3b29d9b4befd4beeee39f5a275e128 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 18:17:59 +0000 Subject: [PATCH 32/41] refactor --- .../XeGPU/Transforms/XeGPUBlocking.cpp | 64 ++++++++----------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 7e627bfc81ac3..50f056dafe0d9 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -180,49 +180,41 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { } bool XeGPUBlockingPass::needsUnroll(Operation *op) const { - if (isa(op)) + // skip the op if any of its operands or results has workgroup level layouts + bool hasWgLayoutOperands = + llvm::any_of(op->getOpOperands(), [](OpOperand &opr) { + xegpu::LayoutAttr layout = xegpu::getLayoutAttr(opr); + return layout && layout.isWgLayout(); + }); + bool hasWgLayoutResults = + llvm::any_of(op->getOpResults(), [](OpResult result) { + xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result); + return layout && layout.isWgLayout(); + }); + if (hasWgLayoutOperands || hasWgLayoutResults) return false; - auto isUnrollable = [](Value value, - ArrayRef tileShape) -> std::optional { + auto isUnrollable = [](Value value, ArrayRef tileShape) { Type valTy = value.getType(); - if (auto tdesc = dyn_cast(valTy)) { - xegpu::LayoutAttr 
layout = tdesc.getLayoutAttr(); - if (!layout) - return std::nullopt; - if (layout.isWgLayout()) - return false; - if (layout.getInstData()) - return true; + if (auto tdescTy = dyn_cast(valTy)) { + xegpu::LayoutAttr layout = tdescTy.getLayoutAttr(); + return layout && layout.getInstData(); } - auto shapedType = dyn_cast(valTy); - if (shapedType && !llvm::equal(tileShape, shapedType.getShape())) - return true; - - return std::nullopt; + return shapedType && !llvm::equal(tileShape, shapedType.getShape()); }; - for (OpOperand &opr : op->getOpOperands()) { - std::optional> tileShape = getTileShape(opr); - if (!tileShape) - continue; - - std::optional unrollable = isUnrollable(opr.get(), *tileShape); - if (unrollable.has_value()) - return unrollable.value(); - } - - for (OpResult result : op->getOpResults()) { - std::optional> tileShape = getTileShape(result); - if (!tileShape) - continue; - - std::optional unrollable = isUnrollable(result, *tileShape); - if (unrollable.has_value()) - return unrollable.value(); - } - return false; + bool hasUnrollableOperands = + llvm::any_of(op->getOpOperands(), [&](OpOperand &opr) { + std::optional> tileShape = getTileShape(opr); + return tileShape.has_value() && isUnrollable(opr.get(), *tileShape); + }); + bool hasUnrollableResults = + llvm::any_of(op->getOpResults(), [&](OpResult result) { + std::optional> tileShape = getTileShape(result); + return tileShape.has_value() && isUnrollable(result, *tileShape); + }); + return hasUnrollableOperands || hasUnrollableResults; } void XeGPUBlockingPass::runOnOperation() { From d9f2e813c722b4ec56cfe9137e6e218dc2e42d8d Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 19:54:09 +0000 Subject: [PATCH 33/41] refactor getTileShape with template --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 6 +-- .../XeGPU/Transforms/XeGPUBlocking.cpp | 46 ++++++++----------- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 9 ++-- 3 files changed, 27 insertions(+), 34 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 4077de593b109..a58d0122d0421 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -57,10 +57,10 @@ FailureOr getDistributedVectorType(VectorType originalType, LayoutAttr layout); /// Return the attribute name for the OpOperand to attach LayoutAttr -std::string getLayoutName(OpOperand &opr); +std::string getLayoutName(const OpOperand &opr); /// Return the attribute name for the OpResult to attach LayoutAttr -std::string getLayoutName(OpResult res); +std::string getLayoutName(const OpResult res); /// Retrieves the LayoutAttr associated with a given Value. For TensorDescType /// values, the LayoutAttr is extracted from the TensorDescType itself. For @@ -71,7 +71,7 @@ LayoutAttr getLayoutAttr(Value value); /// Retrieves the LayoutAttr associated with a given OpOperand. It will /// first check the operand_layout_{id} of the owner operation. If not found, /// it will check the operand itself and its defining op. 
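/// [Editor's sketch, not part of the patch] The two-step lookup just
/// described, restated with the angle-bracketed arguments spelled out;
/// "layout_operand_0" is a hypothetical cached entry name:
///
///   Operation *op = opr.getOwner();
///   std::string name = getLayoutName(opr);          // "layout_operand_0"
///   if (op->hasAttr(name))
///     return op->getAttrOfType<LayoutAttr>(name);   // cached on the owner
///   return getLayoutAttr(opr.get());                // else ask the value itself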
-LayoutAttr getLayoutAttr(OpOperand &opr); +LayoutAttr getLayoutAttr(const OpOperand &opr); /// Sets the LayoutAttr for a given OpOperand by attaching it to the owner void setLayoutAttr(OpOperand &opr, LayoutAttr layout); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 50f056dafe0d9..022bf14492588 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -91,15 +91,14 @@ class XeGPUBlockingPass final void runOnOperation() override; private: - // Get the tile shape for a given operand by examining the layout attribute. - // If layout is not present or is not a subgroup level layout, it returns - // std::nullopt. - std::optional> getTileShape(OpOperand &operand) const; - - // Get the tile shape for a given result by examining the layout attribute. - // If layout is not present or is not a subgroup level layout, it returns - // std::nullopt. - std::optional> getTileShape(OpResult result) const; + // Get the tile shape for a given OpOperand or OpResult by examining the + // corresponding layout attribute. If layout is not present or is not a + // subgroup level layout, it returns std::nullopt. + template <typename T, + typename = std::enable_if_t<std::is_same_v<T, OpOperand> || + std::is_same_v<T, OpResult>>> + std::optional<SmallVector<int64_t>> + getTileShape(const T &operandOrResult) const; // Get the tile shape for a given operation. std::optional> getTileShape(Operation *op) const; @@ -111,31 +110,24 @@ class XeGPUBlockingPass final }; } // namespace +template <typename T> std::optional> -XeGPUBlockingPass::getTileShape(OpOperand &operand) const { - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operand); - if (layout && layout.isSgLayout()) { - if (auto inst_data = layout.getInstData()) - return llvm::to_vector_of(inst_data.asArrayRef()); - - if (auto type = dyn_cast(operand.get().getType())) - return llvm::to_vector(type.getShape()); - } - LDBG("failed to getTileShape for operand: " << operand.get()); - return std::nullopt; -} - -std::optional> -XeGPUBlockingPass::getTileShape(OpResult result) const { - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result); +XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { + Value value; + if constexpr (std::is_same_v<T, OpOperand>) + value = operandOrResult.get(); + else + value = (Value)operandOrResult; + + xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operandOrResult); if (layout && layout.isSgLayout()) { if (auto inst_data = layout.getInstData()) return llvm::to_vector_of(inst_data.asArrayRef()); - if (auto type = dyn_cast(result.getType())) + if (auto type = dyn_cast(value.getType())) return llvm::to_vector(type.getShape()); } - LDBG("failed to getTileShape for result: " << result); + LDBG("failed to getTileShape for: " << value); return std::nullopt; } diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 7cede355b7561..39c274850c7cc 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -101,12 +101,13 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType, return xegpu::getDistributedVectorType(helperTdescTy); } -std::string xegpu::getLayoutName(OpOperand &opr) { +std::string xegpu::getLayoutName(const OpOperand &opr) { const StringRef prefix("layout_operand_"); - return llvm::formatv("{0}{1}", prefix, opr.getOperandNumber()).str(); + unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber(); + return llvm::formatv("{0}{1}", prefix, idx).str(); } -std::string xegpu::getLayoutName(OpResult res) { 
+std::string xegpu::getLayoutName(const OpResult res) { const StringRef prefix = "layout_result_"; return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str(); } From 18e49f6bbf2e8d6fd0fd0fa4a429998778772d5c Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 20:01:28 +0000 Subject: [PATCH 34/41] add qualifiers --- mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 4 ++-- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 942664deba9dd..ff9089ad9db18 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -66,7 +66,7 @@ std::string getLayoutName(const OpResult res); /// values, the LayoutAttr is extracted from the TensorDescType itself. For /// other values, it is obtained from the attributes of the defining operation. /// Returns nullptr if no LayoutAttr is found. -LayoutAttr getLayoutAttr(Value value); +LayoutAttr getLayoutAttr(const Value value); /// Retrieves the LayoutAttr associated with a given OpOperand. It will /// first check the operand_layout_{id} of the owner operation. If not found, /// it will check the operand itself and its defining op. LayoutAttr getLayoutAttr(const OpOperand &opr); /// Sets the LayoutAttr for a given OpOperand by attaching it to the owner -void setLayoutAttr(OpOperand &opr, LayoutAttr layout); +void setLayoutAttr(const OpOperand &opr, const LayoutAttr layout); /// Set the LayoutAttr for the given OpResult by attaching it to the defining op -void setLayoutAttr(OpResult result, LayoutAttr layout); +void setLayoutAttr(const OpResult result, const LayoutAttr layout); /// Set the LayoutAttr for each OpOperand and OpResult of the given operation. /// If the operation contains regions, it is also applied recursively to the diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 39c274850c7cc..69d653a4a45bb 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -112,7 +112,7 @@ std::string xegpu::getLayoutName(const OpResult res) { return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str(); } -xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) { +xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { if (!value) return nullptr; @@ -152,14 +152,14 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const OpOperand &opr) { return getLayoutAttr(opr.get()); } -void xegpu::setLayoutAttr(OpOperand &opr, LayoutAttr layout) { - auto owner = opr.getOwner(); - std::string name = xegpu::getLayoutName(opr); +void xegpu::setLayoutAttr(const OpOperand &opr, const LayoutAttr layout) { + auto owner = opr.getOwner(); + std::string name = xegpu::getLayoutName(opr); if (layout && !owner->hasAttrOfType(name)) owner->setAttr(name, layout); } -void xegpu::setLayoutAttr(OpResult result, LayoutAttr layout) { +void xegpu::setLayoutAttr(const OpResult result, const LayoutAttr layout) { Operation *owner = result.getOwner(); std::string name = xegpu::getLayoutName(result); if (layout && !owner->hasAttr(name)) From 1f218f49c87e4f83e82580a7918e56904ae96677 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 20:03:04 +0000 Subject: [PATCH 35/41] add qualifiers --- mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 942664deba9dd..ff9089ad9db18 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -77,7 +77,7 @@ LayoutAttr getLayoutAttr(const OpOperand &opr); void setLayoutAttr(const OpOperand &opr, const LayoutAttr layout); /// Set the LayoutAttr for the given OpResult by attaching it to the defining op -void setLayoutAttr(OpResult result, LayoutAttr layout); +void setLayoutAttr(const OpResult result, const LayoutAttr layout); /// Set the LayoutAttr for each OpOperand and OpResult of the given operation. /// If the operation contains regions, it is also applied recursively to the From f869b13f990809d8ba08a956d981c29677ff94f7 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 20:15:38 +0000 Subject: [PATCH 36/41] refactor setLayoutAttrs --- mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 11 ++++++----- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 14 ++++----------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index ff9089ad9db18..e215a03b6d909 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -73,11 +73,12 @@ LayoutAttr getLayoutAttr(const Value value); /// it will check the operand itself and its defining op. LayoutAttr getLayoutAttr(const OpOperand &opr); -/// Sets the LayoutAttr for a given OpOperand by attaching it to the owner -void setLayoutAttr(const OpOperand &opr, const LayoutAttr layout); - -/// Set the LayoutAttr for the given OpResult by attaching it to the defining op -void setLayoutAttr(const OpResult result, const LayoutAttr layout); +/// Sets the LayoutAttr for a given OpOperand or OpResult by attaching +/// it to the owner's dictionary attributes +template <typename T, + typename = std::enable_if_t<std::is_same_v<T, OpOperand> || + std::is_same_v<T, OpResult>>> +void setLayoutAttr(const T &operandOrResult, const LayoutAttr layout); /// Set the LayoutAttr for each OpOperand and OpResult of the given operation. 
/// If the operation contains regions, it is also applied recursively to the diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 69d653a4a45bb..56b5b6c2a0ac1 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -152,20 +152,14 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const OpOperand &opr) { return getLayoutAttr(opr.get()); } -void xegpu::setLayoutAttr(const OpOperand &opr, const LayoutAttr layout) { - auto owner = opr.getOwner(); - std::string name = xegpu::getLayoutName(opr); +template <typename T> +void xegpu::setLayoutAttr(const T &operandOrResult, const LayoutAttr layout) { + Operation *owner = operandOrResult.getOwner(); + std::string name = xegpu::getLayoutName(operandOrResult); if (layout && !owner->hasAttrOfType(name)) owner->setAttr(name, layout); } -void xegpu::setLayoutAttr(const OpResult result, const LayoutAttr layout) { - Operation *owner = result.getOwner(); - std::string name = xegpu::getLayoutName(result); - if (layout && !owner->hasAttr(name)) - owner->setAttr(name, layout); -} - void xegpu::setLayoutAttrs(Operation *op, function_ref getLayoutImpl) { op->walk([&](Operation *nestOp) { From de7585536d58d5b383221e21590fe75d0bdeea5a Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 27 May 2025 20:26:58 +0000 Subject: [PATCH 37/41] cleanup unnecessary reference symbols --- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 56b5b6c2a0ac1..ea01a22aa5473 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -224,16 +224,16 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( Operation *op, TypeConverter converter) { MLIRContext *context = op->getContext(); - auto materializeCast = [&](OpBuilder &builder, Type type, ValueRange inputs, - Location loc) -> Value { + auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs, + Location loc) -> Value { return builder.create(loc, type, inputs) .getResult(0); }; { // convert VectorType to RankedTensorType for SCF Structural ops TypeConverter converter; - converter.addConversion([&](Type type) -> Type { return type; }); - converter.addConversion([&](VectorType type) -> Type { + converter.addConversion([](Type type) -> Type { return type; }); + converter.addConversion([](VectorType type) -> Type { return RankedTensorType::get(type.getShape(), type.getElementType()); }); converter.addSourceMaterialization(materializeCast); @@ -251,7 +251,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( { // propagate the layout attribute to RankedTensorType by checking // BuiltInUnrealizedCastOps // for VectorType to RankedTensorType cast. 
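// [Editor's sketch, an assumption about the walk body not shown here] The
// propagation presumably rebuilds each tensor type with the LayoutAttr as
// its encoding, roughly:
//
//   auto tensorTy = dyn_cast<RankedTensorType>(castOp.getResult(0).getType());
//   if (tensorTy && layout)
//     castOp.getResult(0).setType(RankedTensorType::get(
//         tensorTy.getShape(), tensorTy.getElementType(), layout));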
- op->walk([&](UnrealizedConversionCastOp castOp) { + op->walk([](UnrealizedConversionCastOp castOp) { if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1) return WalkResult::skip(); @@ -289,7 +289,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( }); // using yieldOp as anchor to update the result type of its ParentOp - op->walk([&](scf::YieldOp yieldOp) { + op->walk([](scf::YieldOp yieldOp) { Operation *parentOp = yieldOp->getParentOp(); for (OpResult r : parentOp->getOpResults()) { unsigned idx = r.getResultNumber(); @@ -351,8 +351,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( mlir::ConversionTarget target(*context); target.addDynamicallyLegalOp( - [&](UnrealizedConversionCastOp op) { - auto isTensorTy = [&](Type type) { + [](UnrealizedConversionCastOp op) { + auto isTensorTy = [](Type type) { return isa(type); }; return llvm::none_of(op->getOperandTypes(), isTensorTy) && From beacf8abb64dc353f3c05ffc61233aff233fff9f Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Wed, 28 May 2025 14:21:03 +0000 Subject: [PATCH 38/41] update naming --- mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 4 ++-- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index e215a03b6d909..f9327d63869c0 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -57,10 +57,10 @@ FailureOr getDistributedVectorType(VectorType originalType, LayoutAttr layout); /// Return the attribute name for the OpOperand to attach LayoutAttr -std::string getLayoutName(const OpOperand &opr); +std::string getLayoutName(const OpOperand &operand); /// Return the attribute name for the OpResult to attach LayoutAttr -std::string getLayoutName(const OpResult res); +std::string getLayoutName(const OpResult result); /// Retrieves the LayoutAttr associated with a given Value. For TensorDescType /// values, the LayoutAttr is extracted from the TensorDescType itself. 
For diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index ea01a22aa5473..974aac94f9699 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -101,15 +101,15 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType, return xegpu::getDistributedVectorType(helperTdescTy); } -std::string xegpu::getLayoutName(const OpOperand &opr) { +std::string xegpu::getLayoutName(const OpOperand &operand) { const StringRef prefix("layout_operand_"); - unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber(); + unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber(); return llvm::formatv("{0}{1}", prefix, idx).str(); } -std::string xegpu::getLayoutName(const OpResult res) { +std::string xegpu::getLayoutName(const OpResult result) { const StringRef prefix = "layout_result_"; - return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str(); + return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str(); } xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { From c4c7abdd15c949ab044ba5a235f5a344725d73d1 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Wed, 28 May 2025 20:38:15 +0000 Subject: [PATCH 39/41] refactor --- .../XeGPU/Transforms/XeGPUBlocking.cpp | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 022bf14492588..fa666d8fa50c0 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -18,6 +18,7 @@ #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/ADT/STLExtras.h" namespace mlir { namespace xegpu { @@ -43,29 +44,22 @@ static void resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) { ValueRange inputs = castOp.getInputs(); ValueRange outputs = castOp.getOutputs(); - if (inputs.empty() || outputs.empty()) { - LDBG("erase unrealized conversion cast op that has no inputs/outputs."); - castOp->erase(); - return; - } - VectorType inputTy = dyn_cast(inputs[0].getType()); - VectorType outputTy = dyn_cast(outputs[0].getType()); - if (!inputTy || !outputTy) { - LDBG("skip unrealized conversion cast op with non-vector inputs/outputs."); - return; - } + auto hasIdenticalVectorTypes = [](ValueRange values) { + auto types = values.getTypes(); + return llvm::all_of(types, [&](Type type) { + return isa(type) && type == types.front(); + }); + }; // We are only interested in the case where all inputs and outputs have - // identical types + // identical VectorTypes if (!hasIdenticalVectorTypes(inputs) || !hasIdenticalVectorTypes(outputs)) { LDBG("skip unrealized conversion cast op not emulating pack/unpack."); return; } + VectorType outputTy = dyn_cast(outputs[0].getType()); OpBuilder builder(castOp); if (inputs.size() > 1 && outputs.size() == 1) { // the castOp is emulating an unpack op @@ -183,8 +177,10 @@ bool XeGPUBlockingPass::needsUnroll(Operation *op) const { xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result); return layout && layout.isWgLayout(); }); - if (hasWgLayoutOperands || hasWgLayoutResults) + if (hasWgLayoutOperands || hasWgLayoutResults) { + LDBG("skip unrolling for op with 
workgroup level layout: " << *op); return false; + } auto isUnrollable = [](Value value, ArrayRef tileShape) { Type valTy = value.getType(); From 70e84c4105b50e8f40c683f615976ee28bf22e5d Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Mon, 2 Jun 2025 14:54:14 +0000 Subject: [PATCH 40/41] refine comments --- mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td | 7 ++++--- mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 12 +++++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index 79a7c99a8a934..8bdf19ac0e47d 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -48,9 +48,10 @@ def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> { def XeGPUBlocking: Pass<"xegpu-blocking"> { let summary = "Block XeGPU ops into smaller size."; let description = [{ - The pass unrolls XeGPU ops working on large shapes into ops working on small shapes - (given by the inst_data in the layout attr), such that each of them can be dispatch - into a hardware instruction. + This pass partitions operations that process large shapes into multiple + operations on smaller shapes, as specified by the inst_data in the layout + attribute. This enables each resulting operation to be efficiently mapped + to a hardware instruction. }]; let dependentDialects = [ "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect" diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index fa666d8fa50c0..6e736cb7e6972 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -78,7 +78,14 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) { } } -/// Unroll XeGPU ops to their instruction-level representation. +//===------------------------------------------------------------------------===// +// The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops +// to partition operations that process large shapes into multiple operations on +// smaller shapes, as specified by the inst_data in the layout attribute. This +// enables each resulting operation to be efficiently mapped to a hardware +// instruction. +//===------------------------------------------------------------------------===// + class XeGPUBlockingPass final : public xegpu::impl::XeGPUBlockingBase { public: @@ -306,15 +313,18 @@ void XeGPUBlockingPass::runOnOperation() { (void)applyPatternsGreedily(op, std::move(patterns)); op->walk([](Operation *op) { + // Resolve unrealized conversion cast ops emulating pack/unpack if (auto castOp = dyn_cast(op)) resolveUnrealizedConversionCastOp(castOp); + // Remove the layout attributes cached per operand. for (OpOperand &opr : op->getOpOperands()) { std::string name = xegpu::getLayoutName(opr); if (auto layout = op->getAttrOfType(name)) op->removeAttr(name); } + // Update the layout attributes per result. 
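// [Editor's sketch, an assumption consistent with the dropInstData helper
// added in PATCH 29] the per-result update would consume the cached entry
// and re-attach the layout with its inst_data dropped:
//
//   op->removeAttr(name);                       // hypothetical continuation
//   xegpu::setLayoutAttr(result, layout.dropInstData());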
for (OpResult result : op->getOpResults()) { std::string name = xegpu::getLayoutName(result); if (auto layout = op->getAttrOfType(name)) { From 7dd05fa6455aade5a1bd08c8808219f6bd219bfc Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Mon, 2 Jun 2025 12:21:40 -0500 Subject: [PATCH 41/41] Update mlir/test/Dialect/XeGPU/xegpu-blocking.mlir Co-authored-by: Adam Siemieniuk --- mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index 4fe3844dc1c39..f9114988686c8 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -107,7 +107,7 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c16 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) -> (!xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>) { - //CHECK: %22 = xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + //CHECK: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16> //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
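[Editor's note] A minimal, self-contained sketch of how the utilities this series converges on compose; the op is hypothetical and the template arguments are spelled out, so treat this as an illustration rather than the patch's own code:

    #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
    #include "mlir/IR/Operation.h"

    using namespace mlir;

    // Cache each operand/result layout under "layout_operand_<i>" /
    // "layout_result_<i>", then read one back through the same helpers.
    static void roundTripLayouts(Operation *op) {
      xegpu::setLayoutAttrs(op, [](Value v) { return xegpu::getLayoutAttr(v); });
      if (op->getNumResults() > 0) {
        xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op->getOpResult(0));
        (void)layout; // nullptr when no layout is attached anywhere
      }
    }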