From ff1012e2208ef866a0313289d4bf6e130d1a0eaf Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Tue, 27 May 2025 23:40:57 +0000
Subject: [PATCH 01/44] add bug fix

---
 .../Vector/Transforms/VectorDistribute.cpp   | 42 ++++++++++++++-----
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 045c192787f10..1649fb5f91b42 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -15,10 +15,13 @@
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
 #include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Value.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
 #include <utility>

 using namespace mlir;
@@ -1554,22 +1557,36 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
     llvm::SmallSetVector<Value, 32> escapingValues;
     SmallVector<Type> inputTypes;
     SmallVector<Type> distTypes;
+    auto collectEscapingValues = [&](Value value) {
+      if (!escapingValues.insert(value))
+        return;
+      Type distType = value.getType();
+      if (auto vecType = dyn_cast<VectorType>(distType)) {
+        AffineMap map = distributionMapFn(value);
+        distType = getDistributedType(vecType, map, warpOp.getWarpSize());
+      }
+      inputTypes.push_back(value.getType());
+      distTypes.push_back(distType);
+    };
+
     mlir::visitUsedValuesDefinedAbove(
         forOp.getBodyRegion(), [&](OpOperand *operand) {
           Operation *parent = operand->get().getParentRegion()->getParentOp();
           if (warpOp->isAncestor(parent)) {
-            if (!escapingValues.insert(operand->get()))
-              return;
-            Type distType = operand->get().getType();
-            if (auto vecType = dyn_cast<VectorType>(distType)) {
-              AffineMap map = distributionMapFn(operand->get());
-              distType = getDistributedType(vecType, map, warpOp.getWarpSize());
-            }
-            inputTypes.push_back(operand->get().getType());
-            distTypes.push_back(distType);
+            collectEscapingValues(operand->get());
           }
         });
+
+    // Any forOp result that is not already yielded by the warpOp
+    // region is also considered escaping.
+    for (OpResult forResult : forOp.getResults()) {
+      // Check if this forResult is already yielded by the yield op.
+      if (llvm::is_contained(yield->getOperands(), forResult)) {
+        continue;
+      }
+      collectEscapingValues(forResult);
+    }
+
     if (llvm::is_contained(distTypes, Type{}))
       return failure();

@@ -1609,7 +1626,12 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
                                      forOp.getResultTypes().end());
     llvm::SmallDenseMap<Value, int64_t> argIndexMapping;
     for (auto [i, retIdx] : llvm::enumerate(newRetIndices)) {
-      warpInput.push_back(newWarpOp.getResult(retIdx));
+      auto newWarpResult = newWarpOp.getResult(retIdx);
+      // Unused forOp results yielded by the warpOp region are already included
+      // in the new ForOp.
+ if (llvm::is_contained(newOperands, newWarpResult)) + continue; + warpInput.push_back(newWarpResult); argIndexMapping[escapingValues[i]] = warpInputType.size(); warpInputType.push_back(inputTypes[i]); } From c6eb53fefded7152c2d627c4094b66f616bc53ed Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 28 May 2025 20:22:47 +0000 Subject: [PATCH 02/44] add test --- .../Vector/vector-warp-distribute.mlir | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir index 38771f2593449..6c7ac7a5196a7 100644 --- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -584,6 +584,42 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref, %arg2 return } +// ----- +// CHECK-PROP-LABEL: func.func @warp_scf_for_unused_yield( +// CHECK-PROP: %[[W0:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { +// CHECK-PROP: %[[INI0:.*]] = "some_def"() : () -> vector<128xf32> +// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> +// CHECK-PROP: gpu.yield %[[INI0]], %[[INI1]] : vector<128xf32>, vector<128xf32> +// CHECK-PROP: } +// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} iter_args(%{{.*}} = %[[W0]]#0, %{{.*}} = %[[W0]]#1) -> (vector<4xf32>, vector<4xf32>) { +// CHECK-PROP: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) { +// CHECK-PROP: %[[ACC0:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>, index) -> vector<128xf32> +// CHECK-PROP: %[[ACC1:.*]] = "some_def"(%{{.*}}) : (index, vector<128xf32>, vector<128xf32>) -> vector<128xf32> +// CHECK-PROP: gpu.yield %[[ACC1]], %[[ACC0]] : vector<128xf32>, vector<128xf32> +// CHECK-PROP: } +// CHECK-PROP: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<4xf32>, vector<4xf32> +// CHECK-PROP: } +// CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> () +func.func @warp_scf_for_unused_yield(%arg0: index) { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { + %ini = "some_def"() : () -> (vector<128xf32>) + %ini1 = "some_def"() : () -> (vector<128xf32>) + %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini, %arg5 = %ini1) -> (vector<128xf32>, vector<128xf32>) { + %add = arith.addi %arg3, %c1 : index + %1 = "some_def"(%arg5, %add) : (vector<128xf32>, index) -> (vector<128xf32>) + %acc = "some_def"(%add, %arg4, %1) : (index, vector<128xf32>, vector<128xf32>) -> (vector<128xf32>) + scf.yield %acc, %1 : vector<128xf32>, vector<128xf32> + } + gpu.yield %3#0 : vector<128xf32> + } + "some_use"(%0) : (vector<4xf32>) -> () + return +} + + // ----- // CHECK-PROP-LABEL: func @vector_reduction( From 3bdb5961d48bf70b63560820375d24e0682dbff8 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 28 May 2025 20:26:01 +0000 Subject: [PATCH 03/44] add comments --- mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index 1649fb5f91b42..94435588459e6 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -1578,7 +1578,8 @@ struct WarpOpScfForOp : public WarpDistributionPattern { }); 
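// A reduced example of the case handled by this hunk (syntax abbreviated,
// mirroring the test added in PATCH 02): the second scf.for result below
// never reaches the gpu.yield, yet it still escapes the loop and must be
// routed out of the warp region as well.
//
//   %w = gpu.warp_execute_on_lane_0(%lane)[32] -> (vector<4xf32>) {
//     %f:2 = scf.for ... iter_args(%a = %ini, %b = %ini1)
//         -> (vector<128xf32>, vector<128xf32>) { ... }
//     gpu.yield %f#0 : vector<128xf32>
//   }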
// Any forOp result that is not already yielded by the warpOp - // region is also considered escaping. + // region is also considered escaping and must be returned by the + // original warpOp. for (OpResult forResult : forOp.getResults()) { // Check if this forResult is already yielded by the yield op. if (llvm::is_contained(yield->getOperands(), forResult)) { From fe3ab99da99bfe47dd257a458d01ddd4e24df63e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 28 May 2025 21:32:04 +0000 Subject: [PATCH 04/44] remove unsused headers --- mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index 94435588459e6..bd833ddb773f7 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -15,13 +15,10 @@ #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" #include "mlir/IR/AffineExpr.h" -#include "mlir/IR/Value.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Transforms/RegionUtils.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/raw_ostream.h" #include using namespace mlir; From f91b64c88ef893a9a7d620cd76345c21a4a46d33 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 2 Jun 2025 18:24:58 +0000 Subject: [PATCH 05/44] save work --- .../Transforms/XeGPUSubgroupDistribute.cpp | 218 +++++++++++++----- 1 file changed, 164 insertions(+), 54 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 992700524146a..d178c2c33245e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -12,6 +12,8 @@ #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Utils/DistributionUtils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" @@ -30,6 +32,7 @@ #include "mlir/IR/Value.h" #include "mlir/IR/Visitors.h" #include "mlir/Interfaces/FunctionInterfaces.h" +#include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/ArrayRef.h" @@ -38,6 +41,7 @@ #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/InterleavedRange.h" +#include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" namespace mlir { @@ -701,7 +705,47 @@ namespace { //===----------------------------------------------------------------------===// // LayoutAttrAssignment //===----------------------------------------------------------------------===// +template +class UpdateTensorDescType : public OpConversionPattern { +public: + UpdateTensorDescType(MLIRContext *context, + function_ref getLayoutOfValue, + TypeConverter &typeConverter, PatternBenefit benefit = 1) + : OpConversionPattern(typeConverter, context, benefit), + getLayoutOfValue(getLayoutOfValue) {} + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(OpTy op, 
typename OpTy::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Op must have single result. + if (op->getNumResults() != 1) + return failure(); + Type resultType = op->getResult(0).getType(); + // Result type must be a tensor descriptor type. + if (!isa(resultType)) { + LLVM_DEBUG(DBGS() << "Result type is not a tensor descriptor type: " + << resultType << "\n"); + return failure(); + } + auto assignedLayout = getLayoutOfValue(op.getResult()); + if (!assignedLayout) { + LLVM_DEBUG(DBGS() << "No layout assigned for " << *op << "\n"); + return failure(); + } + // Get the original tensor descriptor type. + auto origTensorDescTy = dyn_cast(resultType); + auto newTensorDescTy = xegpu::TensorDescType::get( + origTensorDescTy.getContext(), origTensorDescTy.getShape(), + origTensorDescTy.getElementType(), origTensorDescTy.getEncoding(), + assignedLayout); + rewriter.replaceOpWithNewOp(op, newTensorDescTy, + adaptor.getOperands(), op->getAttrs()); + return success(); + } +private: + function_ref getLayoutOfValue; +}; /// This class is responsible for assigning the layout attributes to the ops and /// their users based on the layout propagation analysis result. class LayoutAttrAssignment { @@ -739,15 +783,19 @@ void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) { /// Convert the layout assigned to a value to xegpu::LayoutAttr. xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) { + llvm::errs() << "getLayoutAttrForValue: " << v << "\n"; LayoutInfo layout = getAnalysisResult(v); - if (!layout.isAssigned()) + if (!layout.isAssigned()) { + llvm::errs() << "No layout assigned for value\n"; return {}; + } SmallVector laneLayout, laneData; for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(), layout.getDataAsArrayRef())) { laneLayout.push_back(static_cast(layout)); laneData.push_back(static_cast(data)); } + llvm::errs() << "return layout\n"; return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData); } @@ -820,14 +868,23 @@ LogicalResult LayoutAttrAssignment::assign(Operation *op) { /// Walk the IR and attach xegpu::LayoutAttr to all ops and their users. 
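/// As a sketch of the intended effect (attribute syntax abbreviated), a
/// tensor descriptor result such as
///   !xegpu.tensor_desc<8x16xf16>
/// is rewritten to carry its propagated layout inline,
///   !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
/// while vector-typed results are tagged with temporary layout_result_N
/// attributes instead.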
LogicalResult LayoutAttrAssignment::run() { - auto walkResult = top->walk([&](Operation *op) { - if (failed(assign(op))) - return WalkResult::interrupt(); - return WalkResult::advance(); - }); - - if (walkResult.wasInterrupted()) - return failure(); + // auto walkResult = top->walk([&](Operation *op) { + // if (failed(assign(op))) + // return WalkResult::interrupt(); + // return WalkResult::advance(); + // }); + + // if (walkResult.wasInterrupted()) + // return failure(); + // apply the UpdateTensorDescType pattern to all ops + // RewritePatternSet patterns(top->getContext()); + // patterns.add( + // top->getContext(), [&](Value v) -> xegpu::LayoutAttr { + // llvm::errs() << "invoking callback for value\n"; + // return getLayoutAttrForValue(v); + // }); + // if (failed(applyPatternsGreedily(top, std::move(patterns)))) + // return failure(); return resolveConflicts(); } @@ -1597,56 +1654,109 @@ void XeGPUSubgroupDistributePass::runOnOperation() { analyis.printAnalysisResult(os); return; } - auto getPropagatedLayout = [&](Value val) { - return analyis.getLayoutInfo(val); + // auto getPropagatedLayout = [&](Value val) { + // return analyis.getLayoutInfo(val); + // }; + auto getXeGpuLayoutForValue = [&](Value val) -> xegpu::LayoutAttr { + LayoutInfo layout = analyis.getLayoutInfo(val); + if (!layout.isAssigned()) { + llvm::errs() << "No layout assigned for value\n"; + return {}; + } + SmallVector laneLayout, laneData; + for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(), + layout.getDataAsArrayRef())) { + laneLayout.push_back(static_cast(layout)); + laneData.push_back(static_cast(data)); + } + return xegpu::LayoutAttr::get(val.getContext(), laneLayout, laneData); + }; + + ConversionTarget target(getContext()); + target.addDynamicallyLegalOp( + [&](Operation *op) { + return llvm::all_of(op->getResults(), [&](Value val) { + if (auto descType = dyn_cast(val.getType())) { + return descType.getLayoutAttr() != nullptr; + } + return true; // Non-tensor descriptor types are always legal. + }); + }); + target.addLegalOp(); + TypeConverter typeConverter; + typeConverter.addConversion([](Type type) { return type; }); + // // typeConverter.addConversion([](xegpu::TensorDescType type) { + // // return xegpu::TensorDescType::get( + // // type.getContext(), type.getShape(), type.getElementType(), + // // type.getEncoding(), + // // xegpu::LayoutAttr::get(type.getContext(), {1, 1}, {1, 1})); + // // }); + auto addUnrealizedCast = [](OpBuilder &builder, Type type, ValueRange inputs, + Location loc) -> Value { + auto cast = builder.create(loc, type, inputs); + return cast.getResult(0); }; + typeConverter.addSourceMaterialization(addUnrealizedCast); + typeConverter.addTargetMaterialization(addUnrealizedCast); + + RewritePatternSet patterns(&getContext()); + patterns.add, + UpdateTensorDescType>( + &getContext(), getXeGpuLayoutForValue, typeConverter); + if (failed( + applyPartialConversion(getOperation(), target, std::move(patterns)))) + signalPassFailure(); + // Assign xegpu::LayoutAttr to all ops and their users based on the layout // propagation analysis result. - LayoutAttrAssignment layoutAssignment(getOperation(), getPropagatedLayout); - if (failed(layoutAssignment.run())) { - signalPassFailure(); - return; - } + // LayoutAttrAssignment layoutAssignment(getOperation(), getPropagatedLayout); + // if (failed(layoutAssignment.run())) { + // signalPassFailure(); + // return; + // } // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0 // operation. 
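// Sketched as IR (op names and types abbreviated, assuming a warp size of
// 32), this wrapping turns
//
//   gpu.func @f(%arg0: ...) { <body> gpu.return }
//
// into
//
//   gpu.func @f(%arg0: ...) {
//     %lane = gpu.lane_id
//     gpu.warp_execute_on_lane_0(%lane)[32] args(%arg0 : ...) { <body> }
//     gpu.return
//   }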
- { - RewritePatternSet patterns(&getContext()); - patterns.add(&getContext()); - - if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { - signalPassFailure(); - return; - } - // At this point, we have moved the entire function body inside the warpOp. - // Now move any scalar uniform code outside of the warpOp (like GPU index - // ops, scalar constants, etc.). This will simplify the later lowering and - // avoid custom patterns for these ops. - getOperation()->walk([&](Operation *op) { - if (auto warpOp = dyn_cast(op)) { - vector::moveScalarUniformCode(warpOp); - } - }); - } - // Finally, do the SIMD to SIMT distribution. - RewritePatternSet patterns(&getContext()); - xegpu::populateXeGPUSubgroupDistributePatterns(patterns); - // TODO: distributionFn and shuffleFn are not used at this point. - auto distributionFn = [](Value val) { - VectorType vecType = dyn_cast(val.getType()); - int64_t vecRank = vecType ? vecType.getRank() : 0; - OpBuilder builder(val.getContext()); - if (vecRank == 0) - return AffineMap::get(val.getContext()); - return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); - }; - auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx, - int64_t warpSz) { return Value(); }; - vector::populatePropagateWarpVectorDistributionPatterns( - patterns, distributionFn, shuffleFn); - if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { - signalPassFailure(); - return; - } + // { + // RewritePatternSet patterns(&getContext()); + // patterns.add(&getContext()); + + // if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + // signalPassFailure(); + // return; + // } + // // At this point, we have moved the entire function body inside the + // warpOp. + // // Now move any scalar uniform code outside of the warpOp (like GPU index + // // ops, scalar constants, etc.). This will simplify the later lowering + // and + // // avoid custom patterns for these ops. + // getOperation()->walk([&](Operation *op) { + // if (auto warpOp = dyn_cast(op)) { + // vector::moveScalarUniformCode(warpOp); + // } + // }); + // } + // // Finally, do the SIMD to SIMT distribution. + // RewritePatternSet patterns(&getContext()); + // xegpu::populateXeGPUSubgroupDistributePatterns(patterns); + // // TODO: distributionFn and shuffleFn are not used at this point. + // auto distributionFn = [](Value val) { + // VectorType vecType = dyn_cast(val.getType()); + // int64_t vecRank = vecType ? 
vecType.getRank() : 0; + // OpBuilder builder(val.getContext()); + // if (vecRank == 0) + // return AffineMap::get(val.getContext()); + // return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); + // }; + // auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value + // srcIdx, + // int64_t warpSz) { return Value(); }; + // vector::populatePropagateWarpVectorDistributionPatterns( + // patterns, distributionFn, shuffleFn); + // if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + // signalPassFailure(); + // return; + // } } From 5cacace6c3f56f3d84b2a63003c2f3d9947b195a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 2 Jun 2025 22:57:27 +0000 Subject: [PATCH 06/44] initial version --- .../Transforms/XeGPUSubgroupDistribute.cpp | 487 ++++++++++-------- 1 file changed, 267 insertions(+), 220 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index d178c2c33245e..aa982ae779d1e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -32,6 +32,7 @@ #include "mlir/IR/Value.h" #include "mlir/IR/Visitors.h" #include "mlir/Interfaces/FunctionInterfaces.h" +#include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/InliningUtils.h" @@ -700,203 +701,264 @@ void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) { } } -namespace { +// namespace { //===----------------------------------------------------------------------===// // LayoutAttrAssignment //===----------------------------------------------------------------------===// -template -class UpdateTensorDescType : public OpConversionPattern { -public: - UpdateTensorDescType(MLIRContext *context, - function_ref getLayoutOfValue, - TypeConverter &typeConverter, PatternBenefit benefit = 1) - : OpConversionPattern(typeConverter, context, benefit), - getLayoutOfValue(getLayoutOfValue) {} - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - // Op must have single result. - if (op->getNumResults() != 1) - return failure(); - Type resultType = op->getResult(0).getType(); - // Result type must be a tensor descriptor type. - if (!isa(resultType)) { - LLVM_DEBUG(DBGS() << "Result type is not a tensor descriptor type: " - << resultType << "\n"); - return failure(); +// template +// class UpdateTensorDescType : public OpConversionPattern { +// public: +// UpdateTensorDescType(MLIRContext *context, +// function_ref +// getLayoutOfValue, TypeConverter &typeConverter, +// PatternBenefit benefit = 1) +// : OpConversionPattern(typeConverter, context, benefit), +// getLayoutOfValue(getLayoutOfValue) {} +// using OpConversionPattern::OpConversionPattern; +// LogicalResult +// matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor, +// ConversionPatternRewriter &rewriter) const override { +// // Op must have single result. +// if (op->getNumResults() != 1) +// return failure(); +// Type resultType = op->getResult(0).getType(); +// // Result type must be a tensor descriptor type. 
+// if (!isa(resultType)) { +// LLVM_DEBUG(DBGS() << "Result type is not a tensor descriptor type: " +// << resultType << "\n"); +// return failure(); +// } +// auto assignedLayout = getLayoutOfValue(op.getResult()); +// if (!assignedLayout) { +// LLVM_DEBUG(DBGS() << "No layout assigned for " << *op << "\n"); +// return failure(); +// } +// // Get the original tensor descriptor type. +// auto origTensorDescTy = dyn_cast(resultType); +// auto newTensorDescTy = xegpu::TensorDescType::get( +// origTensorDescTy.getContext(), origTensorDescTy.getShape(), +// origTensorDescTy.getElementType(), origTensorDescTy.getEncoding(), +// assignedLayout); +// rewriter.replaceOpWithNewOp(op, newTensorDescTy, +// adaptor.getOperands(), op->getAttrs()); +// return success(); +// } + +// private: +// function_ref getLayoutOfValue; +// }; +// /// This class is responsible for assigning the layout attributes to the ops +// and +// /// their users based on the layout propagation analysis result. +// class LayoutAttrAssignment { +// public: +// LayoutAttrAssignment(Operation *top, +// function_ref getLayout) +// : getAnalysisResult(getLayout), top(top) {} + +// LogicalResult run(); + +// private: +// LogicalResult assign(Operation *op); +// void assignToUsers(Value v, xegpu::LayoutAttr layout); +// xegpu::LayoutAttr getLayoutAttrForValue(Value v); +// LogicalResult resolveConflicts(); +// // Callable to get the layout of a value based on the layout propagation +// // analysis. +// function_ref getAnalysisResult; +// Operation *top; +// }; + +// } // namespace + +// /// Helper to assign the layout attribute to the users of the value. +// void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) { +// for (OpOperand &user : v.getUses()) { +// Operation *owner = user.getOwner(); +// unsigned operandNumber = user.getOperandNumber(); +// // Use a generic name for ease of querying the layout attribute later. +// std::string attrName = +// operandLayoutNamePrefix + std::to_string(operandNumber); +// owner->setAttr(attrName, layout); +// } +// } + +// /// Convert the layout assigned to a value to xegpu::LayoutAttr. +// xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) { +// llvm::errs() << "getLayoutAttrForValue: " << v << "\n"; +// LayoutInfo layout = getAnalysisResult(v); +// if (!layout.isAssigned()) { +// llvm::errs() << "No layout assigned for value\n"; +// return {}; +// } +// SmallVector laneLayout, laneData; +// for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(), +// layout.getDataAsArrayRef())) { +// laneLayout.push_back(static_cast(layout)); +// laneData.push_back(static_cast(data)); +// } +// llvm::errs() << "return layout\n"; +// return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData); +// } + +// /// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned +// /// based on the layout propagation analysis result. +// LogicalResult LayoutAttrAssignment::assign(Operation *op) { +// // For function ops, propagate the function argument layout to the users. +// if (auto func = dyn_cast(op)) { +// for (BlockArgument arg : func.getArguments()) { +// xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(arg); +// if (layoutInfo) { +// assignToUsers(arg, layoutInfo); +// } +// } +// return success(); +// } +// // If no results, move on. +// if (op->getNumResults() == 0) +// return success(); +// // If all the results are scalars, move on. 
+// if (llvm::all_of(op->getResultTypes(), +// [](Type t) { return t.isIntOrIndexOrFloat(); })) +// return success(); +// // If the op has more than one result and at least one result is a tensor +// // descriptor, exit. This case is not supported yet. +// // TODO: Support this case. +// if (op->getNumResults() > 1 && llvm::any_of(op->getResultTypes(), [](Type +// t) { +// return isa(t); +// })) { +// LLVM_DEBUG( +// DBGS() << op->getName() +// << " op has more than one result and at least one is a tensor +// " +// "descriptor. This case is not handled.\n"); +// return failure(); +// } +// // If the result is a tensor descriptor, attach the layout to the tensor +// // descriptor itself. +// if (auto tensorDescTy = +// dyn_cast(op->getResultTypes()[0])) { +// xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(op->getResult(0)); +// if (!layoutInfo) { +// LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n"); +// return failure(); +// } + +// // Clone the op, attach the layout to the result tensor descriptor, and +// // remove the original op. +// OpBuilder builder(op); +// Operation *newOp = builder.clone(*op); +// auto newTensorDescTy = xegpu::TensorDescType::get( +// tensorDescTy.getContext(), tensorDescTy.getShape(), +// tensorDescTy.getElementType(), tensorDescTy.getEncoding(), +// layoutInfo); +// newOp->getResult(0).setType(newTensorDescTy); +// op->replaceAllUsesWith(newOp->getResults()); +// op->erase(); +// return success(); +// } +// // Otherwise simply attach the layout to the op itself. +// for (auto [i, r] : llvm::enumerate(op->getResults())) { +// xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(r); +// if (layoutInfo) { +// std::string attrName = resultLayoutNamePrefix + std::to_string(i); +// op->setAttr(attrName, layoutInfo); +// // Attach the layout attribute to the users of the result. +// assignToUsers(r, layoutInfo); +// } +// } +// return success(); +// } + +// /// Walk the IR and attach xegpu::LayoutAttr to all ops and their users. +// LogicalResult LayoutAttrAssignment::run() { +// // auto walkResult = top->walk([&](Operation *op) { +// // if (failed(assign(op))) +// // return WalkResult::interrupt(); +// // return WalkResult::advance(); +// // }); + +// // if (walkResult.wasInterrupted()) +// // return failure(); +// // apply the UpdateTensorDescType pattern to all ops +// // RewritePatternSet patterns(top->getContext()); +// // patterns.add( +// // top->getContext(), [&](Value v) -> xegpu::LayoutAttr { +// // llvm::errs() << "invoking callback for value\n"; +// // return getLayoutAttrForValue(v); +// // }); +// // if (failed(applyPatternsGreedily(top, std::move(patterns)))) +// // return failure(); + +// return resolveConflicts(); +// } + +// /// TODO: Implement the layout conflict resolution. This must ensure mainly +// two +// /// things: +// /// 1) Is a given layout supported by the op? (need to query the target +// /// HW info). Otherwise can we achieve this layout using a layout +// conversion? +// /// 2) Do all the operands have the required layout? If not, can it +// /// be resolved using a layout conversion? 
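// (A plausible shape for such a resolution; a hypothetical sketch only, the
// xegpu::ConvertLayoutOp builder call below is an assumption and not part
// of this patch:
//   if (producerLayout != consumerLayout)
//     v = builder.create<xegpu::ConvertLayoutOp>(loc, v.getType(), v,
//                                                producerLayout,
//                                                consumerLayout);
// )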
+// LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); } +using GetLayoutCallbackFnTy = function_ref; +static void handleBranchTerminatorOpInterface( + mlir::OpBuilder &builder, + mlir::RegionBranchTerminatorOpInterface terminator, + GetLayoutCallbackFnTy getLayoutOfValue) {} +static void handleBranchOpInterface(mlir::OpBuilder &builder, + mlir::RegionBranchOpInterface branch, + GetLayoutCallbackFnTy getLayoutOfValue) {} +static void updateBlockTypes(mlir::OpBuilder &builder, mlir::Block &block, + GetLayoutCallbackFnTy getLayoutOfValue) {} +static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, + GetLayoutCallbackFnTy getLayoutOfValue) { + + auto updateValue = [&](Value v, unsigned vIndex, + const std::string &layoutAttrName) { + // Layouts are needed only for vector and tensor descriptor types. + if (!isa(v.getType())) + return; + xegpu::LayoutAttr layout = getLayoutOfValue(v); + if (!layout) { + // TODO : handle error. + LLVM_DEBUG(DBGS() << "Expecting layout for value: " << v + << " but got none.\n"); + return; } - auto assignedLayout = getLayoutOfValue(op.getResult()); - if (!assignedLayout) { - LLVM_DEBUG(DBGS() << "No layout assigned for " << *op << "\n"); - return failure(); + auto tensorDescTy = dyn_cast(v.getType()); + + if (tensorDescTy) { + auto newTensorDescTy = xegpu::TensorDescType::get( + tensorDescTy.getContext(), tensorDescTy.getShape(), + tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); + v.setType(newTensorDescTy); + return; } - // Get the original tensor descriptor type. - auto origTensorDescTy = dyn_cast(resultType); - auto newTensorDescTy = xegpu::TensorDescType::get( - origTensorDescTy.getContext(), origTensorDescTy.getShape(), - origTensorDescTy.getElementType(), origTensorDescTy.getEncoding(), - assignedLayout); - rewriter.replaceOpWithNewOp(op, newTensorDescTy, - adaptor.getOperands(), op->getAttrs()); - return success(); - } - -private: - function_ref getLayoutOfValue; -}; -/// This class is responsible for assigning the layout attributes to the ops and -/// their users based on the layout propagation analysis result. -class LayoutAttrAssignment { -public: - LayoutAttrAssignment(Operation *top, - function_ref getLayout) - : getAnalysisResult(getLayout), top(top) {} - - LogicalResult run(); - -private: - LogicalResult assign(Operation *op); - void assignToUsers(Value v, xegpu::LayoutAttr layout); - xegpu::LayoutAttr getLayoutAttrForValue(Value v); - LogicalResult resolveConflicts(); - // Callable to get the layout of a value based on the layout propagation - // analysis. - function_ref getAnalysisResult; - Operation *top; -}; - -} // namespace - -/// Helper to assign the layout attribute to the users of the value. -void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) { - for (OpOperand &user : v.getUses()) { - Operation *owner = user.getOwner(); - unsigned operandNumber = user.getOperandNumber(); - // Use a generic name for ease of querying the layout attribute later. - std::string attrName = - operandLayoutNamePrefix + std::to_string(operandNumber); - owner->setAttr(attrName, layout); - } -} - -/// Convert the layout assigned to a value to xegpu::LayoutAttr. 
-xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) { - llvm::errs() << "getLayoutAttrForValue: " << v << "\n"; - LayoutInfo layout = getAnalysisResult(v); - if (!layout.isAssigned()) { - llvm::errs() << "No layout assigned for value\n"; - return {}; - } - SmallVector laneLayout, laneData; - for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(), - layout.getDataAsArrayRef())) { - laneLayout.push_back(static_cast(layout)); - laneData.push_back(static_cast(data)); - } - llvm::errs() << "return layout\n"; - return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData); -} + // If type is vector, add a temporary layout attribute to the op. + op->setAttr(layoutAttrName, layout); + }; -/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned -/// based on the layout propagation analysis result. -LogicalResult LayoutAttrAssignment::assign(Operation *op) { - // For function ops, propagate the function argument layout to the users. - if (auto func = dyn_cast(op)) { - for (BlockArgument arg : func.getArguments()) { - xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(arg); - if (layoutInfo) { - assignToUsers(arg, layoutInfo); - } - } - return success(); - } - // If no results, move on. - if (op->getNumResults() == 0) - return success(); - // If all the results are scalars, move on. - if (llvm::all_of(op->getResultTypes(), - [](Type t) { return t.isIntOrIndexOrFloat(); })) - return success(); - // If the op has more than one result and at least one result is a tensor - // descriptor, exit. This case is not supported yet. - // TODO: Support this case. - if (op->getNumResults() > 1 && llvm::any_of(op->getResultTypes(), [](Type t) { - return isa(t); - })) { - LLVM_DEBUG( - DBGS() << op->getName() - << " op has more than one result and at least one is a tensor " - "descriptor. This case is not handled.\n"); - return failure(); + // Iterate over all the operands. + for (OpOperand &operand : op->getOpOperands()) { + unsigned operandIndex = operand.getOperandNumber(); + std::string operandLayoutName = + operandLayoutNamePrefix + std::to_string(operandIndex); + updateValue(operand.get(), operandIndex, operandLayoutName); } - // If the result is a tensor descriptor, attach the layout to the tensor - // descriptor itself. - if (auto tensorDescTy = - dyn_cast(op->getResultTypes()[0])) { - xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(op->getResult(0)); - if (!layoutInfo) { - LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n"); - return failure(); - } - // Clone the op, attach the layout to the result tensor descriptor, and - // remove the original op. - OpBuilder builder(op); - Operation *newOp = builder.clone(*op); - auto newTensorDescTy = xegpu::TensorDescType::get( - tensorDescTy.getContext(), tensorDescTy.getShape(), - tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo); - newOp->getResult(0).setType(newTensorDescTy); - op->replaceAllUsesWith(newOp->getResults()); - op->erase(); - return success(); + // Iterate over all the results. + for (OpResult result : op->getResults()) { + unsigned resultIndex = result.getResultNumber(); + std::string resultLayoutName = + resultLayoutNamePrefix + std::to_string(resultIndex); + updateValue(result, resultIndex, resultLayoutName); } - // Otherwise simply attach the layout to the op itself. 
- for (auto [i, r] : llvm::enumerate(op->getResults())) { - xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(r); - if (layoutInfo) { - std::string attrName = resultLayoutNamePrefix + std::to_string(i); - op->setAttr(attrName, layoutInfo); - // Attach the layout attribute to the users of the result. - assignToUsers(r, layoutInfo); - } - } - return success(); } -/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users. -LogicalResult LayoutAttrAssignment::run() { - // auto walkResult = top->walk([&](Operation *op) { - // if (failed(assign(op))) - // return WalkResult::interrupt(); - // return WalkResult::advance(); - // }); - - // if (walkResult.wasInterrupted()) - // return failure(); - // apply the UpdateTensorDescType pattern to all ops - // RewritePatternSet patterns(top->getContext()); - // patterns.add( - // top->getContext(), [&](Value v) -> xegpu::LayoutAttr { - // llvm::errs() << "invoking callback for value\n"; - // return getLayoutAttrForValue(v); - // }); - // if (failed(applyPatternsGreedily(top, std::move(patterns)))) - // return failure(); - - return resolveConflicts(); -} - -/// TODO: Implement the layout conflict resolution. This must ensure mainly two -/// things: -/// 1) Is a given layout supported by the op? (need to query the target -/// HW info). Otherwise can we achieve this layout using a layout conversion? -/// 2) Do all the operands have the required layout? If not, can it -/// be resolved using a layout conversion? -LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); } - namespace { //===----------------------------------------------------------------------===// @@ -1657,10 +1719,10 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // auto getPropagatedLayout = [&](Value val) { // return analyis.getLayoutInfo(val); // }; - auto getXeGpuLayoutForValue = [&](Value val) -> xegpu::LayoutAttr { + auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr { LayoutInfo layout = analyis.getLayoutInfo(val); if (!layout.isAssigned()) { - llvm::errs() << "No layout assigned for value\n"; + llvm::errs() << "No layout assigned for value" << val << "\n"; return {}; } SmallVector laneLayout, laneData; @@ -1672,41 +1734,26 @@ void XeGPUSubgroupDistributePass::runOnOperation() { return xegpu::LayoutAttr::get(val.getContext(), laneLayout, laneData); }; - ConversionTarget target(getContext()); - target.addDynamicallyLegalOp( - [&](Operation *op) { - return llvm::all_of(op->getResults(), [&](Value val) { - if (auto descType = dyn_cast(val.getType())) { - return descType.getLayoutAttr() != nullptr; - } - return true; // Non-tensor descriptor types are always legal. 
- }); - }); - target.addLegalOp(); - TypeConverter typeConverter; - typeConverter.addConversion([](Type type) { return type; }); - // // typeConverter.addConversion([](xegpu::TensorDescType type) { - // // return xegpu::TensorDescType::get( - // // type.getContext(), type.getShape(), type.getElementType(), - // // type.getEncoding(), - // // xegpu::LayoutAttr::get(type.getContext(), {1, 1}, {1, 1})); - // // }); - auto addUnrealizedCast = [](OpBuilder &builder, Type type, ValueRange inputs, - Location loc) -> Value { - auto cast = builder.create(loc, type, inputs); - return cast.getResult(0); - }; + mlir::OpBuilder builder(&getContext()); + Operation *op = getOperation(); + op->walk([&](mlir::Block *block) { + for (mlir::Operation &op : llvm::reverse(block->getOperations())) { + if (auto terminator = + mlir::dyn_cast(op)) { + handleBranchTerminatorOpInterface(builder, terminator, + getXeGPULayoutForValue); + continue; + } - typeConverter.addSourceMaterialization(addUnrealizedCast); - typeConverter.addTargetMaterialization(addUnrealizedCast); + if (auto iface = mlir::dyn_cast(op)) { + handleBranchOpInterface(builder, iface, getXeGPULayoutForValue); + continue; + } + updateOp(builder, &op, getXeGPULayoutForValue); + } - RewritePatternSet patterns(&getContext()); - patterns.add, - UpdateTensorDescType>( - &getContext(), getXeGpuLayoutForValue, typeConverter); - if (failed( - applyPartialConversion(getOperation(), target, std::move(patterns)))) - signalPassFailure(); + updateBlockTypes(builder, *block, getXeGPULayoutForValue); + }); // Assign xegpu::LayoutAttr to all ops and their users based on the layout // propagation analysis result. From 7d54194f0c726db4461015de87abf9ad380bbfa3 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 3 Jun 2025 19:50:30 +0000 Subject: [PATCH 07/44] working version --- .../Transforms/XeGPUSubgroupDistribute.cpp | 159 +++++++++++++----- 1 file changed, 120 insertions(+), 39 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index aa982ae779d1e..6b3ff8312e365 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -40,6 +40,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/InterleavedRange.h" #include "llvm/Support/LogicalResult.h" @@ -905,59 +906,140 @@ void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) { // /// be resolved using a layout conversion? // LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); } using GetLayoutCallbackFnTy = function_ref; +static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, + GetLayoutCallbackFnTy getLayoutOfValue) { + + // Iterate over all the results. + for (OpResult result : op->getResults()) { + Type resultType = result.getType(); + // Layouts are needed only for vector and tensor descriptor types. + if (!isa(resultType)) + continue; + // If the result has any users, we expect it to have a layout. + xegpu::LayoutAttr layout = getLayoutOfValue(result); + if (!layout && result.getNumUses() > 0) { + LLVM_DEBUG(DBGS() << "Expecting layout for result: " << result + << " but got none.\n"); + continue; + } + if (auto tensorDescTy = dyn_cast(resultType)) { + // TODO: Handle error. 
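// (What the TODO above would guard against, as a sketch: getLayoutOfValue
// may still return a null attribute for a dead result, and the descriptor
// built below would then silently drop its layout. A guard along these
// lines would surface it:
//   if (!layout) { op->emitError("expected a layout for tensor desc"); return; }
// )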
+ auto typeWithLayout = xegpu::TensorDescType::get( + tensorDescTy.getContext(), tensorDescTy.getShape(), + tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); + result.setType(typeWithLayout); + continue; + } + // If the result is a vector type, add a temporary layout attribute to the + // op. + std::string resultLayoutName = + resultLayoutNamePrefix + std::to_string(result.getResultNumber()); + op->setAttr(resultLayoutName, layout); + // Update all users of the result with the layout. + for (OpOperand &user : result.getUses()) { + Operation *owner = user.getOwner(); + unsigned operandNumber = user.getOperandNumber(); + // Add temorary layout attribute at the user op. + std::string attrName = + operandLayoutNamePrefix + std::to_string(operandNumber); + owner->setAttr(attrName, layout); + } + } +} static void handleBranchTerminatorOpInterface( mlir::OpBuilder &builder, mlir::RegionBranchTerminatorOpInterface terminator, GetLayoutCallbackFnTy getLayoutOfValue) {} static void handleBranchOpInterface(mlir::OpBuilder &builder, mlir::RegionBranchOpInterface branch, - GetLayoutCallbackFnTy getLayoutOfValue) {} -static void updateBlockTypes(mlir::OpBuilder &builder, mlir::Block &block, - GetLayoutCallbackFnTy getLayoutOfValue) {} -static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, - GetLayoutCallbackFnTy getLayoutOfValue) { + GetLayoutCallbackFnTy getLayoutOfValue) { + mlir::Operation *op = branch.getOperation(); + llvm::SmallVector successors; + llvm::SmallVector operands(op->getNumOperands(), nullptr); + branch.getEntrySuccessorRegions(operands, successors); + DenseMap resultToLayouts; + mlir::ValueRange results = op->getResults(); + + for (mlir::RegionSuccessor &successor : successors) { + if (successor.isParent()) + continue; - auto updateValue = [&](Value v, unsigned vIndex, - const std::string &layoutAttrName) { - // Layouts are needed only for vector and tensor descriptor types. - if (!isa(v.getType())) - return; - xegpu::LayoutAttr layout = getLayoutOfValue(v); + mlir::OperandRange initArgs = branch.getEntrySuccessorOperands(successor); + mlir::ValueRange blockArgs = successor.getSuccessorInputs(); + unsigned index = 0; + + for (auto [initArg, blockArg, result] : + llvm::zip(initArgs, blockArgs, results)) { + Type inputType = blockArg.getType(); + if (!isa(inputType)) + continue; + xegpu::LayoutAttr blockArgLayout = getLayoutOfValue(blockArg); + xegpu::LayoutAttr initArgLayout = getLayoutOfValue(initArg); + + if (!blockArgLayout || !initArgLayout) { + LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << blockArg + << " or init arg: " << initArg << "\n"); + continue; + } + + // TOOD: We expect these two to match. Data flow analysis will ensure + // this. + assert(blockArgLayout == initArgLayout && + "Expexing block arg and init arg to have the same layout."); + // Get tensor descriptor type with the layout. + auto tdescTy = dyn_cast(inputType); + auto newTdescTy = xegpu::TensorDescType::get( + tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(), + tdescTy.getEncoding(), blockArgLayout); + blockArg.setType(newTdescTy); + // Store the layout for the result. 
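// (Concretely, for an scf.for nested in the warp op, three values must end
// up with the same layout: the init operand, the region iter_arg, and the
// corresponding loop result; reduced example, types abbreviated:
//   %r = scf.for ... iter_args(%arg = %init)
//       -> (!xegpu.tensor_desc<8x16xf16, #layout>) { ... }
// where #layout stands for the propagated #xegpu.layout attribute.)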
+ if (resultToLayouts.count(result) != 0 && + resultToLayouts[result] != blockArgLayout) { + LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result + << " - " << resultToLayouts[result] << " vs " + << blockArgLayout << "\n"); + } else { + resultToLayouts[result] = blockArgLayout; + } + } + } + for (auto [i, r] : llvm::enumerate(op->getResults())) { + Type resultType = r.getType(); + if (!isa(resultType)) + continue; + xegpu::LayoutAttr layout = getLayoutOfValue(r); + if (!layout) + layout = resultToLayouts[r]; if (!layout) { - // TODO : handle error. - LLVM_DEBUG(DBGS() << "Expecting layout for value: " << v - << " but got none.\n"); - return; + LLVM_DEBUG(DBGS() << "No layout assigned for vector/tensor desc result: " + << r << "\n"); + continue; } - auto tensorDescTy = dyn_cast(v.getType()); - - if (tensorDescTy) { - auto newTensorDescTy = xegpu::TensorDescType::get( + if (auto tensorDescTy = dyn_cast(resultType)) { + auto newTdescTy = xegpu::TensorDescType::get( tensorDescTy.getContext(), tensorDescTy.getShape(), tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); - v.setType(newTensorDescTy); - return; + r.setType(newTdescTy); + continue; } - // If type is vector, add a temporary layout attribute to the op. - op->setAttr(layoutAttrName, layout); - }; - - // Iterate over all the operands. - for (OpOperand &operand : op->getOpOperands()) { - unsigned operandIndex = operand.getOperandNumber(); - std::string operandLayoutName = - operandLayoutNamePrefix + std::to_string(operandIndex); - updateValue(operand.get(), operandIndex, operandLayoutName); - } - - // Iterate over all the results. - for (OpResult result : op->getResults()) { - unsigned resultIndex = result.getResultNumber(); + // If the result is a vector type, add a temporary layout attribute to the + // op. std::string resultLayoutName = - resultLayoutNamePrefix + std::to_string(resultIndex); - updateValue(result, resultIndex, resultLayoutName); + resultLayoutNamePrefix + std::to_string(r.getResultNumber()); + op->setAttr(resultLayoutName, layout); + // Update all users of the result with the layout. + for (OpOperand &user : r.getUses()) { + Operation *owner = user.getOwner(); + unsigned operandNumber = user.getOperandNumber(); + // Add temporary layout attribute at the user op. 
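// (The names are positional; a consumer using the value as its second
// operand ends up tagged roughly as
//   "some_op"(%a, %v) {layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// mirroring the layout_result_N tag placed on the producer above.)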
+ std::string attrName = + operandLayoutNamePrefix + std::to_string(operandNumber); + owner->setAttr(attrName, layout); + } } } +static void updateBlockTypes(mlir::OpBuilder &builder, mlir::Block &block, + GetLayoutCallbackFnTy getLayoutOfValue) {} namespace { @@ -1722,7 +1804,6 @@ void XeGPUSubgroupDistributePass::runOnOperation() { auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr { LayoutInfo layout = analyis.getLayoutInfo(val); if (!layout.isAssigned()) { - llvm::errs() << "No layout assigned for value" << val << "\n"; return {}; } SmallVector laneLayout, laneData; From b289399e44bf56e91149cbfc37a729c14949c4d2 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 3 Jun 2025 21:36:11 +0000 Subject: [PATCH 08/44] working expect for unreal cast --- .../Transforms/XeGPUSubgroupDistribute.cpp | 97 +++++++++---------- 1 file changed, 46 insertions(+), 51 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 6b3ff8312e365..dfb7b0668d2be 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1291,11 +1291,14 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { xegpu::TensorDescType distributedTensorDescTy = descOp.getType().dropLayouts(); // Distributed tensor descriptor type // does not contain layout info. - auto newDescOp = rewriter.create( + Value newDescOp = rewriter.create( newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands, descOp->getAttrs()); Value distributedVal = newWarpOp.getResult(operandIdx); + // Resolve the distributed type to the expected type. + newDescOp = + resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter); rewriter.replaceAllUsesWith(distributedVal, newDescOp); return success(); } @@ -1697,10 +1700,13 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { } } // Create a new update op outside the warp op. - auto newUpdateOp = rewriter.create( + Value newUpdateOp = rewriter.create( newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands, removeTemporaryLayoutAttributes(updateOp->getAttrs())); Value distributedVal = newWarpOp.getResult(operandIdx); + // Resolve the distributed type with the original type. + newUpdateOp = + resolveDistributedTy(newUpdateOp, distributedVal.getType(), rewriter); rewriter.replaceAllUsesWith(distributedVal, newUpdateOp); return success(); } @@ -1836,55 +1842,44 @@ void XeGPUSubgroupDistributePass::runOnOperation() { updateBlockTypes(builder, *block, getXeGPULayoutForValue); }); - // Assign xegpu::LayoutAttr to all ops and their users based on the layout - // propagation analysis result. - // LayoutAttrAssignment layoutAssignment(getOperation(), getPropagatedLayout); - // if (failed(layoutAssignment.run())) { - // signalPassFailure(); - // return; - // } - // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0 // operation. - // { - // RewritePatternSet patterns(&getContext()); - // patterns.add(&getContext()); - - // if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { - // signalPassFailure(); - // return; - // } - // // At this point, we have moved the entire function body inside the - // warpOp. - // // Now move any scalar uniform code outside of the warpOp (like GPU index - // // ops, scalar constants, etc.). This will simplify the later lowering - // and - // // avoid custom patterns for these ops. 
- // getOperation()->walk([&](Operation *op) { - // if (auto warpOp = dyn_cast(op)) { - // vector::moveScalarUniformCode(warpOp); - // } - // }); - // } - // // Finally, do the SIMD to SIMT distribution. - // RewritePatternSet patterns(&getContext()); - // xegpu::populateXeGPUSubgroupDistributePatterns(patterns); - // // TODO: distributionFn and shuffleFn are not used at this point. - // auto distributionFn = [](Value val) { - // VectorType vecType = dyn_cast(val.getType()); - // int64_t vecRank = vecType ? vecType.getRank() : 0; - // OpBuilder builder(val.getContext()); - // if (vecRank == 0) - // return AffineMap::get(val.getContext()); - // return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); - // }; - // auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value - // srcIdx, - // int64_t warpSz) { return Value(); }; - // vector::populatePropagateWarpVectorDistributionPatterns( - // patterns, distributionFn, shuffleFn); - // if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { - // signalPassFailure(); - // return; - // } + { + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext()); + + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + signalPassFailure(); + return; + } + // At this point, we have moved the entire function body inside the + // warpOp. Now move any scalar uniform code outside of the warpOp (like GPU + // index ops, scalar constants, etc.). This will simplify the later lowering + // and avoid custom patterns for these ops. + getOperation()->walk([&](Operation *op) { + if (auto warpOp = dyn_cast(op)) { + vector::moveScalarUniformCode(warpOp); + } + }); + } + // Finally, do the SIMD to SIMT distribution. + RewritePatternSet patterns(&getContext()); + xegpu::populateXeGPUSubgroupDistributePatterns(patterns); + // TODO: distributionFn and shuffleFn are not used at this point. + auto distributionFn = [](Value val) { + VectorType vecType = dyn_cast(val.getType()); + int64_t vecRank = vecType ? vecType.getRank() : 0; + OpBuilder builder(val.getContext()); + if (vecRank == 0) + return AffineMap::get(val.getContext()); + return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); + }; + auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx, + int64_t warpSz) { return Value(); }; + vector::populatePropagateWarpVectorDistributionPatterns( + patterns, distributionFn, shuffleFn); + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + signalPassFailure(); + return; + } } From 4318343ead59cda8f70741ca45e9255a6ce66bba Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 3 Jun 2025 22:57:01 +0000 Subject: [PATCH 09/44] some fixes --- .../Transforms/XeGPUSubgroupDistribute.cpp | 70 ++++++++++++++++--- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index dfb7b0668d2be..56ec1eaa118e5 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -68,8 +68,14 @@ constexpr unsigned packedSizeInBitsForDefault = 16; // Minimum packing size per register for DPAS A. constexpr unsigned packedSizeInBitsForDpasB = 32; // Minimum packing size per register for DPAS B. 
-static const char *const operandLayoutNamePrefix = "layout_operand_"; -static const char *const resultLayoutNamePrefix = "layout_result_"; +static const char *const operandLayoutNamePrefix = + "layout_operand_"; // Attribute name for identifying operand layouts. +static const char *const resultLayoutNamePrefix = + "layout_result_"; // Attribute name for identifying result layouts. +static const char *const resolveSIMTTypeMismatch = + "resolve_simt_type_mismatch"; // Attribute name for identifying + // UnrelizedConversionCastOp added to resolve + // SIMT type mismatches. namespace { @@ -946,11 +952,11 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } } } -static void handleBranchTerminatorOpInterface( +static void updateBranchTerminatorOpInterface( mlir::OpBuilder &builder, mlir::RegionBranchTerminatorOpInterface terminator, GetLayoutCallbackFnTy getLayoutOfValue) {} -static void handleBranchOpInterface(mlir::OpBuilder &builder, +static void updateBranchOpInterface(mlir::OpBuilder &builder, mlir::RegionBranchOpInterface branch, GetLayoutCallbackFnTy getLayoutOfValue) { mlir::Operation *op = branch.getOperation(); @@ -966,7 +972,6 @@ static void handleBranchOpInterface(mlir::OpBuilder &builder, mlir::OperandRange initArgs = branch.getEntrySuccessorOperands(successor); mlir::ValueRange blockArgs = successor.getSuccessorInputs(); - unsigned index = 0; for (auto [initArg, blockArg, result] : llvm::zip(initArgs, blockArgs, results)) { @@ -1117,6 +1122,7 @@ static Value resolveDistributedTy(Value orig, T expected, if (isa(orig.getType())) { auto castOp = rewriter.create(orig.getLoc(), expected, orig); + castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr()); return castOp.getResult(0); } llvm_unreachable("Unsupported type for reconciliation"); @@ -1804,9 +1810,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() { analyis.printAnalysisResult(os); return; } - // auto getPropagatedLayout = [&](Value val) { - // return analyis.getLayoutInfo(val); - // }; + auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr { LayoutInfo layout = analyis.getLayoutInfo(val); if (!layout.isAssigned()) { @@ -1827,13 +1831,13 @@ void XeGPUSubgroupDistributePass::runOnOperation() { for (mlir::Operation &op : llvm::reverse(block->getOperations())) { if (auto terminator = mlir::dyn_cast(op)) { - handleBranchTerminatorOpInterface(builder, terminator, + updateBranchTerminatorOpInterface(builder, terminator, getXeGPULayoutForValue); continue; } if (auto iface = mlir::dyn_cast(op)) { - handleBranchOpInterface(builder, iface, getXeGPULayoutForValue); + updateBranchOpInterface(builder, iface, getXeGPULayoutForValue); continue; } updateOp(builder, &op, getXeGPULayoutForValue); @@ -1882,4 +1886,50 @@ void XeGPUSubgroupDistributePass::runOnOperation() { signalPassFailure(); return; } + + // Clean up UnrealizedConversionCastOps that were inserted due to tensor desc + // type mismatches created by using upstream distribution patterns (scf.for) + getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) { + // We are only interested in UnrealizedConversionCastOps there were added + // for resolving SIMT type mismatches. + if (!op->getAttr(resolveSIMTTypeMismatch)) + return WalkResult::skip(); + + Value input = op.getOperand(0); + Value output = op.getResult(0); + + // Both input and output must have tensor descriptor types. 
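// (The two cast directions distinguished below, shown as IR with types
// abbreviated; only casts tagged resolve_simt_type_mismatch get this far:
//   %a = builtin.unrealized_conversion_cast %0
//          : !xegpu.tensor_desc<.., #layout> to !xegpu.tensor_desc<..>
//   %b = builtin.unrealized_conversion_cast %1
//          : !xegpu.tensor_desc<..> to !xegpu.tensor_desc<.., #layout>
// The first resolves a loop block argument to its SIMT type; the second, at
// the scf.for yield, goes back from the SIMT type to the original type.)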
+ xegpu::TensorDescType inputDescType = + mlir::dyn_cast(input.getType()); + xegpu::TensorDescType outputDescType = + mlir::dyn_cast(output.getType()); + assert(inputDescType && outputDescType && + "Unrealized conversion cast must have tensor descriptor types"); + + // tensor_desc -> tensor_desc Type of conversions. + // This occurs iside scf.for body to resolve the block argument type to SIMT + // type. + if (inputDescType.getLayout()) { + auto argument = mlir::dyn_cast(input); + if (argument) { + argument.setType(output.getType()); + output.replaceAllUsesWith(argument); + if (auto loopOp = mlir::dyn_cast( + argument.getOwner()->getParentOp())) { + auto result = loopOp.getTiedLoopResult(argument); + result.setType(output.getType()); + } + } + } + + // tensor_desc -> tensor_desc Type of + // conversions. This occurs at the yield op of scf.for body to go back from + // SIMT type to original type. + if (outputDescType.getLayout()) + output.replaceAllUsesWith(input); + + if (op->use_empty()) + op->erase(); + return WalkResult::advance(); + }); } From 20a641545534132b59c934d7bc31b6c088134605 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 4 Jun 2025 00:01:15 +0000 Subject: [PATCH 10/44] branch terminator iface --- .../Transforms/XeGPUSubgroupDistribute.cpp | 332 ++++++++++-------- 1 file changed, 193 insertions(+), 139 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 56ec1eaa118e5..27d912b87c6dc 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -955,7 +955,54 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, static void updateBranchTerminatorOpInterface( mlir::OpBuilder &builder, mlir::RegionBranchTerminatorOpInterface terminator, - GetLayoutCallbackFnTy getLayoutOfValue) {} + GetLayoutCallbackFnTy getLayoutOfValue) { + if (!mlir::isa(terminator->getParentOp())) + return; + + llvm::SmallVector successors; + llvm::SmallVector operands(terminator->getNumOperands(), + nullptr); + terminator.getSuccessorRegions(operands, successors); + + for (mlir::RegionSuccessor &successor : successors) { + if (!successor.isParent()) + continue; + + mlir::OperandRange operands = terminator.getSuccessorOperands(successor); + mlir::ValueRange inputs = successor.getSuccessorInputs(); + for (auto [operand, input] : llvm::zip(operands, inputs)) { + // print arg and inp + // llvm::errs() << "arg: " << operand << ", inp: " << input << "\n"; + Type inputType = input.getType(); + if (!isa(inputType)) + continue; + xegpu::LayoutAttr inputLayout = getLayoutOfValue(input); + xegpu::LayoutAttr operandLayout = getLayoutOfValue(operand); + + if (!operandLayout) { + LLVM_DEBUG(DBGS() << "Expecting layout for region successor operand : " + << operand << " but got none.\n"); + continue; + } + + if (inputLayout && inputLayout != operandLayout) { + LLVM_DEBUG( + DBGS() + << "Conflicting layouts for region successor operand and input: " + << inputLayout << " vs " << operandLayout << "\n"); + continue; + } + llvm::errs() << "Setting layout for input to " + << ": " << operandLayout << "\n"; + // Get tensor descriptor type with the layout. 
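// (For an scf.for body this walks the scf.yield operands against the parent
// loop's results, so a yield such as
//   scf.yield %tdesc : !xegpu.tensor_desc<8x16xf16, #layout>
// forces the matching scf.for result type to carry #layout as well; sketch
// only, types abbreviated.)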
+ auto tdescTy = dyn_cast(inputType); + auto newTdescTy = xegpu::TensorDescType::get( + tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(), + tdescTy.getEncoding(), operandLayout); + input.setType(newTdescTy); + } + } +} static void updateBranchOpInterface(mlir::OpBuilder &builder, mlir::RegionBranchOpInterface branch, GetLayoutCallbackFnTy getLayoutOfValue) { @@ -970,20 +1017,19 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder, if (successor.isParent()) continue; - mlir::OperandRange initArgs = branch.getEntrySuccessorOperands(successor); - mlir::ValueRange blockArgs = successor.getSuccessorInputs(); + mlir::OperandRange operands = branch.getEntrySuccessorOperands(successor); + mlir::ValueRange inputs = successor.getSuccessorInputs(); - for (auto [initArg, blockArg, result] : - llvm::zip(initArgs, blockArgs, results)) { - Type inputType = blockArg.getType(); + for (auto [operand, input, result] : llvm::zip(operands, inputs, results)) { + Type inputType = input.getType(); if (!isa(inputType)) continue; - xegpu::LayoutAttr blockArgLayout = getLayoutOfValue(blockArg); - xegpu::LayoutAttr initArgLayout = getLayoutOfValue(initArg); + xegpu::LayoutAttr blockArgLayout = getLayoutOfValue(input); + xegpu::LayoutAttr initArgLayout = getLayoutOfValue(operand); if (!blockArgLayout || !initArgLayout) { - LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << blockArg - << " or init arg: " << initArg << "\n"); + LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << input + << " or init arg: " << operand << "\n"); continue; } @@ -996,52 +1042,54 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder, auto newTdescTy = xegpu::TensorDescType::get( tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(), tdescTy.getEncoding(), blockArgLayout); - blockArg.setType(newTdescTy); + input.setType(newTdescTy); // Store the layout for the result. - if (resultToLayouts.count(result) != 0 && - resultToLayouts[result] != blockArgLayout) { - LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result - << " - " << resultToLayouts[result] << " vs " - << blockArgLayout << "\n"); - } else { - resultToLayouts[result] = blockArgLayout; - } - } - } - for (auto [i, r] : llvm::enumerate(op->getResults())) { - Type resultType = r.getType(); - if (!isa(resultType)) - continue; - xegpu::LayoutAttr layout = getLayoutOfValue(r); - if (!layout) - layout = resultToLayouts[r]; - if (!layout) { - LLVM_DEBUG(DBGS() << "No layout assigned for vector/tensor desc result: " - << r << "\n"); - continue; - } - if (auto tensorDescTy = dyn_cast(resultType)) { - auto newTdescTy = xegpu::TensorDescType::get( - tensorDescTy.getContext(), tensorDescTy.getShape(), - tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); - r.setType(newTdescTy); - continue; - } - // If the result is a vector type, add a temporary layout attribute to the - // op. - std::string resultLayoutName = - resultLayoutNamePrefix + std::to_string(r.getResultNumber()); - op->setAttr(resultLayoutName, layout); - // Update all users of the result with the layout. - for (OpOperand &user : r.getUses()) { - Operation *owner = user.getOwner(); - unsigned operandNumber = user.getOperandNumber(); - // Add temporary layout attribute at the user op. 
- std::string attrName = - operandLayoutNamePrefix + std::to_string(operandNumber); - owner->setAttr(attrName, layout); + // if (resultToLayouts.count(result) != 0 && + // resultToLayouts[result] != blockArgLayout) { + // LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result + // << " - " << resultToLayouts[result] << " vs " + // << blockArgLayout << "\n"); + // } else { + // resultToLayouts[result] = blockArgLayout; + // } } } + // for (auto [i, r] : llvm::enumerate(op->getResults())) { + // Type resultType = r.getType(); + // if (!isa(resultType)) + // continue; + // xegpu::LayoutAttr layout = getLayoutOfValue(r); + // if (!layout) + // layout = resultToLayouts[r]; + // if (!layout) { + // LLVM_DEBUG(DBGS() << "No layout assigned for vector/tensor desc result: + // " + // << r << "\n"); + // continue; + // } + // if (auto tensorDescTy = dyn_cast(resultType)) { + // auto newTdescTy = xegpu::TensorDescType::get( + // tensorDescTy.getContext(), tensorDescTy.getShape(), + // tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); + // r.setType(newTdescTy); + // continue; + // } + // // If the result is a vector type, add a temporary layout attribute to + // the + // // op. + // std::string resultLayoutName = + // resultLayoutNamePrefix + std::to_string(r.getResultNumber()); + // op->setAttr(resultLayoutName, layout); + // // Update all users of the result with the layout. + // for (OpOperand &user : r.getUses()) { + // Operation *owner = user.getOwner(); + // unsigned operandNumber = user.getOperandNumber(); + // // Add temporary layout attribute at the user op. + // std::string attrName = + // operandLayoutNamePrefix + std::to_string(operandNumber); + // owner->setAttr(attrName, layout); + // } + // } } static void updateBlockTypes(mlir::OpBuilder &builder, mlir::Block &block, GetLayoutCallbackFnTy getLayoutOfValue) {} @@ -1846,90 +1894,96 @@ void XeGPUSubgroupDistributePass::runOnOperation() { updateBlockTypes(builder, *block, getXeGPULayoutForValue); }); - // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0 - // operation. - { - RewritePatternSet patterns(&getContext()); - patterns.add(&getContext()); - - if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { - signalPassFailure(); - return; - } - // At this point, we have moved the entire function body inside the - // warpOp. Now move any scalar uniform code outside of the warpOp (like GPU - // index ops, scalar constants, etc.). This will simplify the later lowering - // and avoid custom patterns for these ops. - getOperation()->walk([&](Operation *op) { - if (auto warpOp = dyn_cast(op)) { - vector::moveScalarUniformCode(warpOp); - } - }); - } - // Finally, do the SIMD to SIMT distribution. - RewritePatternSet patterns(&getContext()); - xegpu::populateXeGPUSubgroupDistributePatterns(patterns); - // TODO: distributionFn and shuffleFn are not used at this point. - auto distributionFn = [](Value val) { - VectorType vecType = dyn_cast(val.getType()); - int64_t vecRank = vecType ? 
vecType.getRank() : 0; - OpBuilder builder(val.getContext()); - if (vecRank == 0) - return AffineMap::get(val.getContext()); - return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); - }; - auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx, - int64_t warpSz) { return Value(); }; - vector::populatePropagateWarpVectorDistributionPatterns( - patterns, distributionFn, shuffleFn); - if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { - signalPassFailure(); - return; - } - - // Clean up UnrealizedConversionCastOps that were inserted due to tensor desc - // type mismatches created by using upstream distribution patterns (scf.for) - getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) { - // We are only interested in UnrealizedConversionCastOps there were added - // for resolving SIMT type mismatches. - if (!op->getAttr(resolveSIMTTypeMismatch)) - return WalkResult::skip(); - - Value input = op.getOperand(0); - Value output = op.getResult(0); - - // Both input and output must have tensor descriptor types. - xegpu::TensorDescType inputDescType = - mlir::dyn_cast(input.getType()); - xegpu::TensorDescType outputDescType = - mlir::dyn_cast(output.getType()); - assert(inputDescType && outputDescType && - "Unrealized conversion cast must have tensor descriptor types"); - - // tensor_desc -> tensor_desc Type of conversions. - // This occurs iside scf.for body to resolve the block argument type to SIMT - // type. - if (inputDescType.getLayout()) { - auto argument = mlir::dyn_cast(input); - if (argument) { - argument.setType(output.getType()); - output.replaceAllUsesWith(argument); - if (auto loopOp = mlir::dyn_cast( - argument.getOwner()->getParentOp())) { - auto result = loopOp.getTiedLoopResult(argument); - result.setType(output.getType()); - } - } - } - - // tensor_desc -> tensor_desc Type of - // conversions. This occurs at the yield op of scf.for body to go back from - // SIMT type to original type. - if (outputDescType.getLayout()) - output.replaceAllUsesWith(input); - - if (op->use_empty()) - op->erase(); - return WalkResult::advance(); - }); + // // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0 + // // operation. + // { + // RewritePatternSet patterns(&getContext()); + // patterns.add(&getContext()); + + // if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + // signalPassFailure(); + // return; + // } + // // At this point, we have moved the entire function body inside the + // // warpOp. Now move any scalar uniform code outside of the warpOp (like + // GPU + // // index ops, scalar constants, etc.). This will simplify the later + // lowering + // // and avoid custom patterns for these ops. + // getOperation()->walk([&](Operation *op) { + // if (auto warpOp = dyn_cast(op)) { + // vector::moveScalarUniformCode(warpOp); + // } + // }); + // } + // // Finally, do the SIMD to SIMT distribution. + // RewritePatternSet patterns(&getContext()); + // xegpu::populateXeGPUSubgroupDistributePatterns(patterns); + // // TODO: distributionFn and shuffleFn are not used at this point. + // auto distributionFn = [](Value val) { + // VectorType vecType = dyn_cast(val.getType()); + // int64_t vecRank = vecType ? 
vecType.getRank() : 0; + // OpBuilder builder(val.getContext()); + // if (vecRank == 0) + // return AffineMap::get(val.getContext()); + // return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); + // }; + // auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value + // srcIdx, + // int64_t warpSz) { return Value(); }; + // vector::populatePropagateWarpVectorDistributionPatterns( + // patterns, distributionFn, shuffleFn); + // if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + // signalPassFailure(); + // return; + // } + + // // Clean up UnrealizedConversionCastOps that were inserted due to tensor + // desc + // // type mismatches created by using upstream distribution patterns + // (scf.for) getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) { + // // We are only interested in UnrealizedConversionCastOps there were added + // // for resolving SIMT type mismatches. + // if (!op->getAttr(resolveSIMTTypeMismatch)) + // return WalkResult::skip(); + + // Value input = op.getOperand(0); + // Value output = op.getResult(0); + + // // Both input and output must have tensor descriptor types. + // xegpu::TensorDescType inputDescType = + // mlir::dyn_cast(input.getType()); + // xegpu::TensorDescType outputDescType = + // mlir::dyn_cast(output.getType()); + // assert(inputDescType && outputDescType && + // "Unrealized conversion cast must have tensor descriptor types"); + + // // tensor_desc -> tensor_desc Type of conversions. + // // This occurs iside scf.for body to resolve the block argument type to + // SIMT + // // type. + // if (inputDescType.getLayout()) { + // auto argument = mlir::dyn_cast(input); + // if (argument) { + // argument.setType(output.getType()); + // output.replaceAllUsesWith(argument); + // if (auto loopOp = mlir::dyn_cast( + // argument.getOwner()->getParentOp())) { + // auto result = loopOp.getTiedLoopResult(argument); + // result.setType(output.getType()); + // } + // } + // } + + // // tensor_desc -> tensor_desc Type of + // // conversions. This occurs at the yield op of scf.for body to go back + // from + // // SIMT type to original type. + // if (outputDescType.getLayout()) + // output.replaceAllUsesWith(input); + + // if (op->use_empty()) + // op->erase(); + // return WalkResult::advance(); + // }); } From 7bd0be22d02e14f2ca4c5530b8a14e6b18781803 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 4 Jun 2025 15:17:27 +0000 Subject: [PATCH 11/44] save work --- .../Transforms/XeGPUSubgroupDistribute.cpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 27d912b87c6dc..b997af37a072b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -938,18 +938,18 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } // If the result is a vector type, add a temporary layout attribute to the // op. - std::string resultLayoutName = - resultLayoutNamePrefix + std::to_string(result.getResultNumber()); - op->setAttr(resultLayoutName, layout); - // Update all users of the result with the layout. - for (OpOperand &user : result.getUses()) { - Operation *owner = user.getOwner(); - unsigned operandNumber = user.getOperandNumber(); - // Add temorary layout attribute at the user op. 
- std::string attrName = - operandLayoutNamePrefix + std::to_string(operandNumber); - owner->setAttr(attrName, layout); - } + // std::string resultLayoutName = + // resultLayoutNamePrefix + std::to_string(result.getResultNumber()); + // op->setAttr(resultLayoutName, layout); + // // Update all users of the result with the layout. + // for (OpOperand &user : result.getUses()) { + // Operation *owner = user.getOwner(); + // unsigned operandNumber = user.getOperandNumber(); + // // Add temorary layout attribute at the user op. + // std::string attrName = + // operandLayoutNamePrefix + std::to_string(operandNumber); + // owner->setAttr(attrName, layout); + // } } } static void updateBranchTerminatorOpInterface( From 00dc2b67a925ac79d9dc6bee5bf4a167217304eb Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 4 Jun 2025 22:51:37 +0000 Subject: [PATCH 12/44] working --- .../Transforms/XeGPUSubgroupDistribute.cpp | 295 +++++++++--------- .../Dialect/XeGPU/subgroup-distribution.mlir | 98 +++--- 2 files changed, 195 insertions(+), 198 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index b997af37a072b..a17c8d8a4f3f3 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -938,18 +938,18 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } // If the result is a vector type, add a temporary layout attribute to the // op. - // std::string resultLayoutName = - // resultLayoutNamePrefix + std::to_string(result.getResultNumber()); - // op->setAttr(resultLayoutName, layout); - // // Update all users of the result with the layout. - // for (OpOperand &user : result.getUses()) { - // Operation *owner = user.getOwner(); - // unsigned operandNumber = user.getOperandNumber(); - // // Add temorary layout attribute at the user op. - // std::string attrName = - // operandLayoutNamePrefix + std::to_string(operandNumber); - // owner->setAttr(attrName, layout); - // } + std::string resultLayoutName = + resultLayoutNamePrefix + std::to_string(result.getResultNumber()); + op->setAttr(resultLayoutName, layout); + // Update all users of the result with the layout. + for (OpOperand &user : result.getUses()) { + Operation *owner = user.getOwner(); + unsigned operandNumber = user.getOperandNumber(); + // Add temorary layout attribute at the user op. + std::string attrName = + operandLayoutNamePrefix + std::to_string(operandNumber); + owner->setAttr(attrName, layout); + } } } static void updateBranchTerminatorOpInterface( @@ -992,8 +992,6 @@ static void updateBranchTerminatorOpInterface( << inputLayout << " vs " << operandLayout << "\n"); continue; } - llvm::errs() << "Setting layout for input to " - << ": " << operandLayout << "\n"; // Get tensor descriptor type with the layout. auto tdescTy = dyn_cast(inputType); auto newTdescTy = xegpu::TensorDescType::get( @@ -1044,55 +1042,51 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder, tdescTy.getEncoding(), blockArgLayout); input.setType(newTdescTy); // Store the layout for the result. 
- // if (resultToLayouts.count(result) != 0 && - // resultToLayouts[result] != blockArgLayout) { - // LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result - // << " - " << resultToLayouts[result] << " vs " - // << blockArgLayout << "\n"); - // } else { - // resultToLayouts[result] = blockArgLayout; - // } + if (resultToLayouts.count(result) != 0 && + resultToLayouts[result] != blockArgLayout) { + LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result + << " - " << resultToLayouts[result] << " vs " + << blockArgLayout << "\n"); + } else { + resultToLayouts[result] = blockArgLayout; + } + } + } + for (auto [i, r] : llvm::enumerate(op->getResults())) { + Type resultType = r.getType(); + if (!isa(resultType)) + continue; + xegpu::LayoutAttr layout = getLayoutOfValue(r); + if (!layout) + layout = resultToLayouts[r]; + if (!layout) { + LLVM_DEBUG(DBGS() << "No layout assigned for vector/tensor desc result:" + << r << "\n"); + continue; + } + if (auto tensorDescTy = dyn_cast(resultType)) { + auto newTdescTy = xegpu::TensorDescType::get( + tensorDescTy.getContext(), tensorDescTy.getShape(), + tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); + r.setType(newTdescTy); + continue; + } + // If the result is a vector type, add a temporary layout attribute to + // the op. + std::string resultLayoutName = + resultLayoutNamePrefix + std::to_string(r.getResultNumber()); + op->setAttr(resultLayoutName, layout); + // Update all users of the result with the layout. + for (OpOperand &user : r.getUses()) { + Operation *owner = user.getOwner(); + unsigned operandNumber = user.getOperandNumber(); + // Add temporary layout attribute at the user op. + std::string attrName = + operandLayoutNamePrefix + std::to_string(operandNumber); + owner->setAttr(attrName, layout); } } - // for (auto [i, r] : llvm::enumerate(op->getResults())) { - // Type resultType = r.getType(); - // if (!isa(resultType)) - // continue; - // xegpu::LayoutAttr layout = getLayoutOfValue(r); - // if (!layout) - // layout = resultToLayouts[r]; - // if (!layout) { - // LLVM_DEBUG(DBGS() << "No layout assigned for vector/tensor desc result: - // " - // << r << "\n"); - // continue; - // } - // if (auto tensorDescTy = dyn_cast(resultType)) { - // auto newTdescTy = xegpu::TensorDescType::get( - // tensorDescTy.getContext(), tensorDescTy.getShape(), - // tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); - // r.setType(newTdescTy); - // continue; - // } - // // If the result is a vector type, add a temporary layout attribute to - // the - // // op. - // std::string resultLayoutName = - // resultLayoutNamePrefix + std::to_string(r.getResultNumber()); - // op->setAttr(resultLayoutName, layout); - // // Update all users of the result with the layout. - // for (OpOperand &user : r.getUses()) { - // Operation *owner = user.getOwner(); - // unsigned operandNumber = user.getOperandNumber(); - // // Add temporary layout attribute at the user op. 
- // std::string attrName = - // operandLayoutNamePrefix + std::to_string(operandNumber); - // owner->setAttr(attrName, layout); - // } - // } } -static void updateBlockTypes(mlir::OpBuilder &builder, mlir::Block &block, - GetLayoutCallbackFnTy getLayoutOfValue) {} namespace { @@ -1890,100 +1884,93 @@ void XeGPUSubgroupDistributePass::runOnOperation() { } updateOp(builder, &op, getXeGPULayoutForValue); } - - updateBlockTypes(builder, *block, getXeGPULayoutForValue); }); - // // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0 - // // operation. - // { - // RewritePatternSet patterns(&getContext()); - // patterns.add(&getContext()); - - // if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { - // signalPassFailure(); - // return; - // } - // // At this point, we have moved the entire function body inside the - // // warpOp. Now move any scalar uniform code outside of the warpOp (like - // GPU - // // index ops, scalar constants, etc.). This will simplify the later - // lowering - // // and avoid custom patterns for these ops. - // getOperation()->walk([&](Operation *op) { - // if (auto warpOp = dyn_cast(op)) { - // vector::moveScalarUniformCode(warpOp); - // } - // }); - // } - // // Finally, do the SIMD to SIMT distribution. - // RewritePatternSet patterns(&getContext()); - // xegpu::populateXeGPUSubgroupDistributePatterns(patterns); - // // TODO: distributionFn and shuffleFn are not used at this point. - // auto distributionFn = [](Value val) { - // VectorType vecType = dyn_cast(val.getType()); - // int64_t vecRank = vecType ? vecType.getRank() : 0; - // OpBuilder builder(val.getContext()); - // if (vecRank == 0) - // return AffineMap::get(val.getContext()); - // return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); - // }; - // auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value - // srcIdx, - // int64_t warpSz) { return Value(); }; - // vector::populatePropagateWarpVectorDistributionPatterns( - // patterns, distributionFn, shuffleFn); - // if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { - // signalPassFailure(); - // return; - // } - - // // Clean up UnrealizedConversionCastOps that were inserted due to tensor - // desc - // // type mismatches created by using upstream distribution patterns - // (scf.for) getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) { - // // We are only interested in UnrealizedConversionCastOps there were added - // // for resolving SIMT type mismatches. - // if (!op->getAttr(resolveSIMTTypeMismatch)) - // return WalkResult::skip(); - - // Value input = op.getOperand(0); - // Value output = op.getResult(0); - - // // Both input and output must have tensor descriptor types. - // xegpu::TensorDescType inputDescType = - // mlir::dyn_cast(input.getType()); - // xegpu::TensorDescType outputDescType = - // mlir::dyn_cast(output.getType()); - // assert(inputDescType && outputDescType && - // "Unrealized conversion cast must have tensor descriptor types"); - - // // tensor_desc -> tensor_desc Type of conversions. - // // This occurs iside scf.for body to resolve the block argument type to - // SIMT - // // type. 
- // if (inputDescType.getLayout()) { - // auto argument = mlir::dyn_cast(input); - // if (argument) { - // argument.setType(output.getType()); - // output.replaceAllUsesWith(argument); - // if (auto loopOp = mlir::dyn_cast( - // argument.getOwner()->getParentOp())) { - // auto result = loopOp.getTiedLoopResult(argument); - // result.setType(output.getType()); - // } - // } - // } - - // // tensor_desc -> tensor_desc Type of - // // conversions. This occurs at the yield op of scf.for body to go back - // from - // // SIMT type to original type. - // if (outputDescType.getLayout()) - // output.replaceAllUsesWith(input); - - // if (op->use_empty()) - // op->erase(); - // return WalkResult::advance(); - // }); + // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0 + // operation. + { + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext()); + + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + signalPassFailure(); + return; + } + // At this point, we have moved the entire function body inside the + // warpOp. Now move any scalar uniform code outside of the warpOp (like + // GPU index ops, scalar constants, etc.). This will simplify the + // later lowering and avoid custom patterns for these ops. + getOperation()->walk([&](Operation *op) { + if (auto warpOp = dyn_cast(op)) { + vector::moveScalarUniformCode(warpOp); + } + }); + } + // Finally, do the SIMD to SIMT distribution. + RewritePatternSet patterns(&getContext()); + xegpu::populateXeGPUSubgroupDistributePatterns(patterns); + // TODO: distributionFn and shuffleFn are not used at this point. + auto distributionFn = [](Value val) { + VectorType vecType = dyn_cast(val.getType()); + int64_t vecRank = vecType ? vecType.getRank() : 0; + OpBuilder builder(val.getContext()); + if (vecRank == 0) + return AffineMap::get(val.getContext()); + return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); + }; + auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx, + int64_t warpSz) { return Value(); }; + vector::populatePropagateWarpVectorDistributionPatterns( + patterns, distributionFn, shuffleFn); + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + signalPassFailure(); + return; + } + + // Clean up UnrealizedConversionCastOps that were inserted due to tensor + // desc type mismatches created by using upstream distribution patterns + // (scf.for) + getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) { + // We are only interested in UnrealizedConversionCastOps there were added + // for resolving SIMT type mismatches. + if (!op->getAttr(resolveSIMTTypeMismatch)) + return WalkResult::skip(); + + Value input = op.getOperand(0); + Value output = op.getResult(0); + + // Both input and output must have tensor descriptor types. + xegpu::TensorDescType inputDescType = + mlir::dyn_cast(input.getType()); + xegpu::TensorDescType outputDescType = + mlir::dyn_cast(output.getType()); + assert(inputDescType && outputDescType && + "Unrealized conversion cast must have tensor descriptor types"); + + // tensor_desc -> tensor_desc Type of conversions. + // This occurs iside scf.for body to resolve the block argument type to + // SIMT type. 
+ if (inputDescType.getLayout()) { + auto argument = mlir::dyn_cast(input); + if (argument) { + argument.setType(output.getType()); + output.replaceAllUsesWith(argument); + if (auto loopOp = mlir::dyn_cast( + argument.getOwner()->getParentOp())) { + auto result = loopOp.getTiedLoopResult(argument); + result.setType(output.getType()); + } + } + } + + // tensor_desc -> tensor_desc Type of + // conversions. This occurs at the yield op of scf.for body to go back + // from SIMT type to original type. + if (outputDescType.getLayout()) + output.replaceAllUsesWith(input); + + if (op->use_empty()) + op->erase(); + return WalkResult::advance(); + }); } diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir index e5606c5642505..b5f6bda26d830 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir @@ -93,49 +93,54 @@ gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16 } // ----- -// CHECK-LABEL: gpu.func @dpas -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG3:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T1:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]] -// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>, memref<8x16xf32>) -> (vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) { -// CHECK: ^bb0(%[[ARG4:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG5:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG6:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG7:[0-9a-zA-Z]+]]: memref<8x16xf32>): -// CHECK: gpu.yield %[[ARG4]], %[[ARG5]], %[[ARG6]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -// CHECK: } -// CHECK-DAG: %[[T2:.*]] = vector.shape_cast %[[T1]]#0 : vector<8x1xf16> to vector<8xf16> -// CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[T1]]#1 : vector<16x1xf16> to vector<16xf16> -// CHECK-DAG: %[[T4:.*]] = vector.shape_cast %[[T1]]#2 : vector<8x1xf32> to vector<8xf32> -// CHECK: %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[T4]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32> -// CHECK: %[[T6:.*]] = xegpu.create_nd_tdesc %[[ARG3]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T5]], %[[T6]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK-LABEL: gpu.func @load_dpas_store +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> +// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @test { -gpu.func @dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){ +gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: 
memref<16x16xf16>, %arg3: memref<8x16xf32>){ %c0 = arith.constant 0 : index - %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + %5 = xegpu.create_nd_tdesc %arg3[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } + // ----- -// CHECK-LABEL: gpu.func @load_dpas_store +// CHECK-LABEL: gpu.func @load_dpas_postop_store // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> // CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> +// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> +// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32> +// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T8]], %[[T7]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @test { -gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg3: memref<8x16xf32>){ +gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg3: memref<8x16xf32>){ %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg3[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %5 = math.exp %4 : vector<8x16xf32> + %6 = xegpu.create_nd_tdesc %arg3[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } @@ -169,20 +174,22 @@ gpu.func @create_nd_tdesc_non_memref(%arg0: 
ui64, %arg1: ui64, // CHECK: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index // CHECK: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%[[X_COORD]], %[[Y_COORD]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> -// CHECK: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> -// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) { -// CHECK: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[K]], %[[Y_COORD]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> -// CHECK: %[[T11:.*]] = xegpu.load_nd %[[T10]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> -// CHECK: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[X_COORD]], %[[K]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> -// CHECK: %[[T13:.*]] = xegpu.load_nd %[[T12]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> -// CHECK: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> -// CHECK: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> -// CHECK: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> -// CHECK: scf.yield %[[T16]] : vector<8x1xf32> -// CHECK: } -// CHECK: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> -// CHECK: xegpu.store_nd %[[T9]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> +// CHECK-DAG: %[[C_INIT:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> +// CHECK-DAG: %[[B_TILE:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}, %[[Y_COORD]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> +// CHECK-DAG: %[[A_TILE:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[X_COORD]], %{{.*}}] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> +// CHECK: %[[T7:.*]]:3 = scf.for {{.*}} iter_args(%[[C_VAL:.*]] = %[[C_INIT]], %[[A_ARG:.*]] = %[[A_TILE]], %[[B_ARG:.*]] = %[[B_TILE]]) -> (vector<8x1xf32>, !xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16>) { +// CHECK-DAG: %[[B_NEXT:.*]] = xegpu.update_nd_offset %[[B_ARG]], [{{.*}}] : !xegpu.tensor_desc<16x16xbf16> +// CHECK-DAG: %[[A_NEXT:.*]] = xegpu.update_nd_offset %[[A_ARG]], [{{.*}}] : !xegpu.tensor_desc<8x16xbf16> +// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[B_ARG]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> +// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[A_ARG]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> +// CHECK-DAG: %[[C:.*]] = vector.shape_cast %[[C_VAL]] : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: %[[T8:.*]] = xegpu.dpas %[[A]], %[[B]], %[[C]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> +// CHECK-NEXT: %[[C_OUT:.*]] = vector.shape_cast %[[T8]] : vector<8xf32> to vector<8x1xf32> +// CHECK-NEXT: scf.yield %[[C_OUT]], %[[A_NEXT]], %[[B_NEXT]] : vector<8x1xf32>, !xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16> +// CHECK-NEXT:} +// CHECK-NEXT: %[[C_FINAL:.*]] = vector.shape_cast %[[T7]]#0 : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: xegpu.store_nd %[[C_FINAL]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @test { gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: 
 memref<1024x1024xf32>){
   %c0 = arith.constant 0 : index
   %c16 = arith.constant 16 : index
   %c8 = arith.constant 8 : index
   %c1024 = arith.constant 1024 : index
   %0 = gpu.block_id x
   %1 = gpu.block_id y
   %2 = arith.muli %0, %c8 : index
   %3 = arith.muli %1, %c16 : index
   %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
   %5 = xegpu.load_nd %4 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
-  %6 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %5) -> (vector<8x16xf32>) {
-    %7 = xegpu.create_nd_tdesc %arg0[%2, %arg3] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-    %8 = xegpu.create_nd_tdesc %arg1[%arg3, %3] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
-    %9 = xegpu.load_nd %7 : !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16>
-    %10 = xegpu.load_nd %8 : !xegpu.tensor_desc<16x16xbf16> -> vector<16x16xbf16>
+  %7 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+  %8 = xegpu.create_nd_tdesc %arg1[%c0, %3] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+  %6:3 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %5, %arg5 = %7, %arg6 = %8) -> (vector<8x16xf32>, !xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16>) {
+    %9 = xegpu.load_nd %arg5 : !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16>
+    %10 = xegpu.load_nd %arg6 : !xegpu.tensor_desc<16x16xbf16> -> vector<16x16xbf16>
+    %12 = xegpu.update_nd_offset %arg5, [%c0, %c16] : !xegpu.tensor_desc<8x16xbf16>
+    %13 = xegpu.update_nd_offset %arg6, [%c16, %c0] : !xegpu.tensor_desc<16x16xbf16>
     %11 = xegpu.dpas %9, %10, %arg4 : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
-    scf.yield %11 : vector<8x16xf32>
+    scf.yield %11, %12, %13 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16>
   }
-  xegpu.store_nd %6, %4 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  %12 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %6#0, %12 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   gpu.return
 }
}

From 35620ec131462b97239409a984d792455289a32e Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 5 Jun 2025 15:39:43 +0000
Subject: [PATCH 13/44] move out layout prop

---
 .../mlir/Dialect/XeGPU/Transforms/Passes.td   |  12 +
 .../Dialect/XeGPU/Transforms/CMakeLists.txt   |   1 +
 .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 920 ++++++++++++++
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 1052 -----------------
 4 files changed, 933 insertions(+), 1052 deletions(-)
 create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 6f585f9ceb29b..08e02f295a851 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -33,6 +33,18 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
         "Print the result of the subgroup map propagation analysis and exit.">];
 }

+def XeGPULayoutPropagate : Pass<"xegpu-layout-propagate"> {
+  let summary = "Propagate XeGPU layout information";
+  let description = [{
+    This pass propagates the XeGPU layout information across ops. Starting
+    from a set of anchor operations (e.g. `dpas`, `store_nd`), this will
+    propagate the layouts required for operands and results to the producers
+    or consumers.
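+
+    A minimal sketch of the intended effect (illustrative only; the
+    `layout_result_0` attribute name below is the temporary attribute this
+    pass uses during propagation, and the shapes are made up):
+
+    ```mlir
+    // Before propagation:
+    %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+    // After propagation, the producer result carries the layout required by
+    // its consumer:
+    %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout<
+           lane_layout = [1, 16], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+    ```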
+  }];
+  let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
+                           "vector::VectorDialect"];
+}
+
 def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> {
   let summary = "Transform WorkGroup level XeGPU code to SubGroup level";
   let description = [{
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 7d9b5584b0b2b..a72be9cd60b9c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -3,6 +3,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
   XeGPUSubgroupDistribute.cpp
   XeGPUUnroll.cpp
   XeGPUWgToSgDistribute.cpp
+  XeGPULayoutPropagate.cpp

   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp
new file mode 100644
index 0000000000000..f308d338b511a
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp
@@ -0,0 +1,920 @@
+//===- XeGPULayoutPropagate.cpp - XeGPU Layout Propagation ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
+#include "mlir/Analysis/DataFlowFramework.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InterleavedRange.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mlir {
+namespace xegpu {
+#define GEN_PASS_DEF_XEGPULAYOUTPROPAGATE
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
+} // namespace xegpu
+} // namespace mlir
+
+#define DEBUG_TYPE "xegpu-layout-propagate"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
+using namespace mlir;
+using namespace mlir::dataflow;
+
+/// HW dependent constants.
+/// TODO: These constants should be queried from the target information.
+constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup.
+/// If DPAS A or B operands have low precision element types they must be
+/// packed according to the following sizes.
+constexpr unsigned packedSizeInBitsForDefault =
+    16; // Minimum packing size per register for DPAS A.
+constexpr unsigned packedSizeInBitsForDpasB =
+    32; // Minimum packing size per register for DPAS B.
+static const char *const operandLayoutNamePrefix =
+    "layout_operand_"; // Attribute name for identifying operand layouts.
+static const char *const resultLayoutNamePrefix =
+    "layout_result_"; // Attribute name for identifying result layouts.
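+// For example (sketch): during propagation, the first result of a producer op
+// is tagged "layout_result_0" and the second operand of a consumer op is
+// tagged "layout_operand_1"; the numeric suffix is the result/operand number.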
+static const char *const resolveSIMTTypeMismatch =
+    "resolve_simt_type_mismatch"; // Attribute name for identifying
+                                  // UnrealizedConversionCastOps added to
+                                  // resolve SIMT type mismatches.
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Layout
+//===----------------------------------------------------------------------===//
+
+/// Helper class to store the ND layout of lanes within a subgroup and data
+/// owned by each lane.
+struct Layout {
+  SmallVector<int64_t> layout;
+  Layout() = default;
+  Layout(std::initializer_list<int64_t> list) : layout(list) {}
+  void print(llvm::raw_ostream &os) const;
+  size_t size() const { return layout.size(); }
+  int64_t operator[](size_t idx) const;
+};
+
+void Layout::print(llvm::raw_ostream &os) const {
+  os << llvm::interleaved_array(layout);
+}
+
+int64_t Layout::operator[](size_t idx) const {
+  assert(idx < layout.size() && "Index out of bounds.");
+  return layout[idx];
+}
+
+/// LaneLayout represents the logical layout of lanes within a subgroup when
+/// it accesses some value. LaneData represents the logical layout of data
+/// owned by each work item.
+using LaneLayout = Layout;
+using LaneData = Layout;
+
+//===----------------------------------------------------------------------===//
+// LayoutInfo
+//===----------------------------------------------------------------------===//
+
+/// Helper class for tracking the analysis state of an mlir value. For layout
+/// propagation, the analysis state is simply the lane_layout and lane_data of
+/// each value. The purpose of this analysis is to propagate some unique
+/// layout for each value in the program starting from a set of anchor
+/// operations (like DPAS, StoreNd, etc.).
+///
+/// Given this, LayoutInfo satisfies the following properties:
+///  1) A LayoutInfo value can be in one of two states - `assigned` or `not
+///  assigned`.
+///  2) Two LayoutInfo values are equal if they are both assigned or
+///  both not assigned. The concrete value of assigned state does not matter.
+///  3) The meet operator works as follows:
+///     - If the current state is assigned, return the current state (a unique
+///     layout is already assigned; don't change it).
+///     - Otherwise, return the other state.
+
+struct LayoutInfo {
+private:
+  LaneLayout laneLayout;
+  LaneData laneData;
+
+public:
+  LayoutInfo() = default;
+  LayoutInfo(const LaneLayout &layout, const LaneData &data)
+      : laneLayout(layout), laneData(data) {}
+
+  // Two lattice values are equal if they have `some` layout. The actual
+  // content of the layout does not matter.
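+  // Worked example of the semantics (sketch): any two assigned layouts
+  // compare equal, assigned != unassigned, meet(assigned, unassigned) and
+  // meet(unassigned, assigned) both give the assigned layout, and
+  // meet(assigned1, assigned2) keeps assigned1 (the current state wins).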
+  bool operator==(const LayoutInfo &other) const {
+    return this->isAssigned() == other.isAssigned();
+  }
+
+  static LayoutInfo meet(const LayoutInfo &lhs, const LayoutInfo &rhs);
+
+  static LayoutInfo join(const LayoutInfo &lhs, const LayoutInfo &rhs);
+
+  void print(raw_ostream &os) const;
+
+  bool isAssigned() const {
+    return laneLayout.size() > 0 && laneData.size() > 0;
+  }
+
+  LayoutInfo getTransposedLayout(ArrayRef<int64_t> permutation) const;
+
+  const LaneLayout &getLayout() const { return laneLayout; }
+  const LaneData &getData() const { return laneData; }
+  ArrayRef<int64_t> getLayoutAsArrayRef() const { return laneLayout.layout; }
+  ArrayRef<int64_t> getDataAsArrayRef() const { return laneData.layout; }
+};
+
+void LayoutInfo::print(raw_ostream &os) const {
+  if (isAssigned()) {
+    os << "lane_layout: ";
+    laneLayout.print(os);
+    os << ", lane_data: ";
+    laneData.print(os);
+  } else {
+    os << "Not assigned.";
+  }
+}
+
+LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) {
+  if (!lhs.isAssigned())
+    return rhs;
+  return lhs;
+}
+
+/// Since this is a backward analysis, join method is not used.
+LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) {
+  llvm_unreachable("Join should not be triggered by layout propagation.");
+}
+
+/// Get the transposed layout according to the given permutation.
+LayoutInfo
+LayoutInfo::getTransposedLayout(ArrayRef<int64_t> permutation) const {
+  if (!isAssigned())
+    return {};
+  LaneLayout newLayout;
+  LaneData newData;
+  for (int64_t idx : permutation) {
+    newLayout.layout.push_back(laneLayout.layout[idx]);
+    newData.layout.push_back(laneData.layout[idx]);
+  }
+  return LayoutInfo(newLayout, newData);
+}
+
+//===----------------------------------------------------------------------===//
+// LayoutInfoLattice
+//===----------------------------------------------------------------------===//
+
+/// Lattice holding the LayoutInfo for each value.
+struct LayoutInfoLattice : public Lattice<LayoutInfo> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LayoutInfoLattice)
+  using Lattice::Lattice;
+};
+
+/// Helper Functions to get default layouts. A `default layout` is a layout
+/// that is assigned to a value when the layout is not fixed by some anchor
+/// operation (like DPAS).
+
+/// Helper Function to get the default layout for uniform values like
+/// constants. For 1D vector, lane_layout is [subgroupSize] and lane_data is
+/// [1]. For 2D vector, lane_layout is [1, subgroupSize] and lane_data is
+/// [1, 1].
+static LayoutInfo getDefaultLayoutInfo(unsigned rank) {
+  assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
+  if (rank == 1)
+    return LayoutInfo(LaneLayout({subgroupSize}), LaneData({1}));
+  return LayoutInfo(LaneLayout({1, subgroupSize}), LaneData({1, 1}));
+}
+
+/// Helper to get the default layout for a vector type.
+static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) {
+  // Expecting a 1D or 2D vector.
+  assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
+         "Expected 1D or 2D vector.");
+  // Expecting int or float element type.
+  assert(vectorTy.getElementType().isIntOrFloat() &&
+         "Expected int or float element type.");
+  // If the rank is 1, then return default layout for 1D vector.
+  if (vectorTy.getRank() == 1)
+    return getDefaultLayoutInfo(1);
+  // Packing factor is determined by the element type bitwidth.
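+  // For example (with packedSizeInBitsForDefault == 16): i8 elements give a
+  // packing factor of 16 / 8 = 2 and thus lane_data [1, 2], while f16 and
+  // f32 elements keep the default lane_data [1, 1].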
+  int packingFactor = 1;
+  unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
+  if (bitwidth < packedSizeInBitsForDefault)
+    packingFactor = packedSizeInBitsForDefault / bitwidth;
+  return LayoutInfo(LaneLayout({1, subgroupSize}),
+                    LaneData({1, packingFactor}));
+}
+
+/// Helper Function to get the expected layouts for DPAS operands. `lane_data`
+/// is set according to the following criteria:
+/// * For A operand, the data must be packed in minimum
+///   `packedSizeInBitsForDefault`
+/// * For B operand, the data must be packed in minimum
+///   `packedSizeInBitsForDpasB`
+static LayoutInfo getLayoutInfoForDPASOperand(VectorType vectorTy,
+                                              unsigned operandNum) {
+  Type elementTy = vectorTy.getElementType();
+  assert(elementTy.isIntOrFloat() &&
+         "Expected int or float type in DPAS operands");
+  LaneLayout layout({1, subgroupSize});
+  // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
+  // must have the VNNI format.
+  if (operandNum == 1 &&
+      elementTy.getIntOrFloatBitWidth() < packedSizeInBitsForDpasB) {
+    LaneData data(
+        {packedSizeInBitsForDpasB / elementTy.getIntOrFloatBitWidth(), 1});
+    return LayoutInfo(layout, data);
+  }
+  // Otherwise, return the default layout for the vector type.
+  return getDefaultLayoutInfo(vectorTy);
+}
+
+//===----------------------------------------------------------------------===//
+// LayoutInfoPropagation
+//===----------------------------------------------------------------------===//
+
+/// Backward data flow analysis to propagate the lane_layout and lane_data of
+/// each value in the program. Currently, the layouts for the operands of
+/// DPAS, StoreNd, and StoreScatter ops are fixed (known before propagation).
+/// The purpose of this analysis is to propagate those known layouts to all
+/// their producers and (other) consumers.
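+///
+/// For example (a sketch with made-up shapes), in:
+///   %c = xegpu.dpas %a, %b : vector<8x16xf16>, vector<16x16xf16>
+///          -> vector<8x16xf32>
+/// the DPAS anchor fixes the layouts of %a and %b, and those layouts then
+/// flow backward to the xegpu.load_nd ops (and any other producers) that
+/// define %a and %b.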
+class LayoutInfoPropagation
+    : public SparseBackwardDataFlowAnalysis<LayoutInfoLattice> {
+private:
+  void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+                   ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitStoreNdOp(xegpu::StoreNdOp store,
+                      ArrayRef<LayoutInfoLattice *> operands,
+                      ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitStoreScatterOp(xegpu::StoreScatterOp storeScatter,
+                           ArrayRef<LayoutInfoLattice *> operands,
+                           ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitLoadNdOp(xegpu::LoadNdOp load,
+                     ArrayRef<LayoutInfoLattice *> operands,
+                     ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitLoadGatherOp(xegpu::LoadGatherOp load,
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitTransposeOp(vector::TransposeOp transpose,
+                        ArrayRef<LayoutInfoLattice *> operands,
+                        ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitVectorBitcastOp(vector::BitCastOp bitcast,
+                            ArrayRef<LayoutInfoLattice *> operands,
+                            ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitCreateDescOp(xegpu::CreateDescOp createDesc,
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
+                             ArrayRef<LayoutInfoLattice *> operands,
+                             ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitPrefetchNdOp(xegpu::PrefetchNdOp prefetch,
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results);
+
+  void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,
+                                   ArrayRef<LayoutInfoLattice *> operands,
+                                   ArrayRef<const LayoutInfoLattice *> results);
+
+public:
+  LayoutInfoPropagation(DataFlowSolver &solver,
+                        SymbolTableCollection &symbolTable)
+      : SparseBackwardDataFlowAnalysis(solver, symbolTable) {}
+  using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis;
+
+  LogicalResult
+  visitOperation(Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+                 ArrayRef<const LayoutInfoLattice *> results) override;
+
+  void visitBranchOperand(OpOperand &operand) override {};
+
+  void visitCallOperand(OpOperand &operand) override {};
+
+  void visitExternalCall(CallOpInterface call,
+                         ArrayRef<LayoutInfoLattice *> operands,
+                         ArrayRef<const LayoutInfoLattice *> results) override {
+  };
+
+  void setToExitState(LayoutInfoLattice *lattice) override {
+    (void)lattice->meet(LayoutInfo());
+  }
+};
+} // namespace
+
+LogicalResult LayoutInfoPropagation::visitOperation(
+    Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  TypeSwitch<Operation *>(op)
+      .Case<xegpu::DpasOp>(
+          [&](auto dpasOp) { visitDpasOp(dpasOp, operands, results); })
+      .Case<xegpu::StoreNdOp>(
+          [&](auto storeNdOp) { visitStoreNdOp(storeNdOp, operands, results); })
+      .Case<xegpu::StoreScatterOp>([&](auto storeScatterOp) {
+        visitStoreScatterOp(storeScatterOp, operands, results);
+      })
+      .Case<xegpu::LoadNdOp>(
+          [&](auto loadNdOp) { visitLoadNdOp(loadNdOp, operands, results); })
+      .Case<xegpu::LoadGatherOp>([&](auto loadGatherOp) {
+        visitLoadGatherOp(loadGatherOp, operands, results);
+      })
+      .Case<xegpu::CreateDescOp>([&](auto createDescOp) {
+        visitCreateDescOp(createDescOp, operands, results);
+      })
+      .Case<xegpu::UpdateNdOffsetOp>([&](auto updateNdOffsetOp) {
+        visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
+      })
+      .Case<xegpu::PrefetchNdOp>([&](auto prefetchNdOp) {
+        visitPrefetchNdOp(prefetchNdOp, operands, results);
+      })
+      // No need to propagate the layout to operands in CreateNdDescOp because
+      // they are scalars (offsets, sizes, etc.).
+      .Case<xegpu::CreateNdDescOp>([&](auto createNdDescOp) {})
+      .Case<vector::TransposeOp>([&](auto transposeOp) {
+        visitTransposeOp(transposeOp, operands, results);
+      })
+      .Case<vector::BitCastOp>([&](auto bitcastOp) {
+        visitVectorBitcastOp(bitcastOp, operands, results);
+      })
+      .Case<vector::MultiDimReductionOp>([&](auto reductionOp) {
+        visitVectorMultiReductionOp(reductionOp, operands, results);
+      })
+      // All other ops.
+      .Default([&](Operation *op) {
+        for (const LayoutInfoLattice *r : results) {
+          for (LayoutInfoLattice *operand : operands) {
+            // Propagate the layout of the result to the operand.
+            if (r->getValue().isAssigned())
+              meet(operand, *r);
+          }
+        }
+      });
+  // Add a dependency from each result to program point after the operation.
+  for (const LayoutInfoLattice *r : results) {
+    addDependency(const_cast<LayoutInfoLattice *>(r),
+                  getProgramPointAfter(op));
+  }
+  return success();
+}
+
+void LayoutInfoPropagation::visitPrefetchNdOp(
+    xegpu::PrefetchNdOp prefetch, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Here we assign the default layout to the tensor descriptor operand of
+  // prefetch.
+  auto tdescTy = prefetch.getTensorDescType();
+  auto prefetchLayout = getDefaultLayoutInfo(
+      VectorType::get(tdescTy.getShape(), tdescTy.getElementType()));
+  // Propagate the layout to the source tensor descriptor.
+  propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
+}
+
+void LayoutInfoPropagation::visitVectorMultiReductionOp(
+    vector::MultiDimReductionOp reduction,
+    ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // The layout of the result must be present.
+  LayoutInfo resultLayout = results[0]->getValue();
+  if (!resultLayout.isAssigned())
+    return;
+  // We only consider 2D -> 1D reductions at this point.
+  assert(resultLayout.getLayout().size() == 1 &&
+         "Expected 1D layout for reduction result.");
+  // Given that the result is 1D, the layout of the operand should be 2D with
+  // default layout.
+  LayoutInfo operandLayout = getDefaultLayoutInfo(2);
+  propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
+  // Accumulator should have the same layout as the result.
+  propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
+}
+
+/// Propagate the layout of the result tensor to the source tensor descriptor
+/// in UpdateNdOffsetOp.
+void LayoutInfoPropagation::visitUpdateNdOffsetOp(
+    xegpu::UpdateNdOffsetOp updateNdOffset,
+    ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // The layout of the result must be present.
+  LayoutInfo resultLayout = results[0]->getValue();
+  if (!resultLayout.isAssigned())
+    return;
+  // Propagate the layout to the source operand.
+  propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
+}
+
+/// Set the layouts for DPAS A, B, and C operands.
+void LayoutInfoPropagation::visitDpasOp(
+    xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  VectorType aTy = dpas.getLhsType();
+  VectorType bTy = dpas.getRhsType();
+  propagateIfChanged(operands[0],
+                     operands[0]->meet(getLayoutInfoForDPASOperand(aTy, 0)));
+  propagateIfChanged(operands[1],
+                     operands[1]->meet(getLayoutInfoForDPASOperand(bTy, 1)));
+  if (operands.size() > 2) {
+    VectorType cTy = dpas.getAccType();
+    propagateIfChanged(operands[2],
+                       operands[2]->meet(getLayoutInfoForDPASOperand(cTy, 2)));
+  }
+}
+
+/// Set the layout for the value and tensor descriptor operands in StoreNdOp.
+void LayoutInfoPropagation::visitStoreNdOp(
+    xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo storeLayout = getDefaultLayoutInfo(store.getValueType());
+  // Both operands should have the same layout.
+  for (LayoutInfoLattice *operand : operands) {
+    propagateIfChanged(operand, operand->meet(storeLayout));
+  }
+}
+
+/// Propagate the layout of the value to the tensor descriptor operand in
+/// LoadNdOp.
+void LayoutInfoPropagation::visitLoadNdOp(
+    xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo valueLayout = results[0]->getValue();
+  // Need the layout of the value to propagate to the tensor descriptor.
+  if (!valueLayout.isAssigned())
+    return;
+  LayoutInfo tensorDescLayout = valueLayout;
+  // LoadNdOp has the transpose effect. However, at the stage of this analysis
+  // this effect is not expected and should be abstracted away. Emit a warning.
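+  // For reference (sketch): getTransposedLayout just permutes both tuples,
+  // e.g. lane_layout [1, 16] / lane_data [1, 1] under permutation [1, 0]
+  // becomes lane_layout [16, 1] / lane_data [1, 1].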
+  if (auto transpose = load.getTranspose()) {
+    load.emitWarning("Transpose effect is not expected for LoadNdOp at "
+                     "LayoutInfoPropagation stage.");
+    tensorDescLayout = valueLayout.getTransposedLayout(transpose.value());
+  }
+  // Propagate the new layout to the tensor descriptor operand.
+  propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
+}
+
+/// For vector::TransposeOp, the layout of the result is transposed and
+/// propagated to the operand.
+void LayoutInfoPropagation::visitTransposeOp(
+    vector::TransposeOp transpose, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Need the layout of transpose result to propagate to the operands.
+  LayoutInfo resultLayout = results[0]->getValue();
+  if (!resultLayout.isAssigned())
+    return;
+  LayoutInfo newLayout =
+      resultLayout.getTransposedLayout(transpose.getPermutation());
+  // Propagate the new layout to the vector operand.
+  propagateIfChanged(operands[0], operands[0]->meet(newLayout));
+}
+
+/// For vector::BitCastOp, the lane_data of the source layout is changed based
+/// on the bit width of the source and result types.
+void LayoutInfoPropagation::visitVectorBitcastOp(
+    vector::BitCastOp bitcast, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Need the layout of bitcast result to propagate to the operands.
+  LayoutInfo resultLayout = results[0]->getValue();
+  if (!resultLayout.isAssigned())
+    return;
+  int inElemTyBitWidth =
+      bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
+  int outElemTyBitWidth =
+      bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
+
+  // LaneLayout does not change.
+  const LaneLayout &newLaneLayout = resultLayout.getLayout();
+  const LaneData &currData = resultLayout.getData();
+  LaneData newLaneData;
+  // It's a widening bitcast
+  if (inElemTyBitWidth < outElemTyBitWidth) {
+    int ratio = outElemTyBitWidth / inElemTyBitWidth;
+    newLaneData = resultLayout.getData()[0] == 1
+                      ? LaneData({1, currData[1] * ratio})
+                      : LaneData({currData[0] * ratio, 1});
+  } else {
+    // It's a narrowing bitcast
+    int ratio = inElemTyBitWidth / outElemTyBitWidth;
+    newLaneData = resultLayout.getData()[0] == 1
+                      ? LaneData({1, currData[1] / ratio})
+                      : LaneData({currData[0] / ratio, 1});
+  }
+
+  propagateIfChanged(operands[0],
+                     operands[0]->meet(LayoutInfo(newLaneLayout, newLaneData)));
+}
+
+/// Propagate the layout of the result to the tensor descriptor and mask
+/// operands in LoadGatherOp.
+void LayoutInfoPropagation::visitLoadGatherOp(
+    xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo valueLayout = results[0]->getValue();
+  // Need the layout of the value to propagate to the tensor descriptor.
+  if (!valueLayout.isAssigned())
+    return;
+
+  LayoutInfo tensorDescLayout = valueLayout;
+  if (load.getTranspose()) {
+    // LoadGatherOp has the transpose effect. However, at the stage of this
+    // analysis this effect is not expected and should be abstracted away. Emit
+    // a warning.
+    load.emitWarning("Transpose effect is not expected for LoadGatherOp at "
+                     "LayoutInfoPropagation stage.");
+    tensorDescLayout = valueLayout.getTransposedLayout({1, 0});
+  }
+  // Mask operand should have 1D default layout.
+  LayoutInfo maskLayout = getDefaultLayoutInfo(1);
+  // Propagate the new layout to the tensor descriptor operand.
+  propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
+  // Propagate the new layout to the mask operand.
+  propagateIfChanged(operands[1], operands[1]->meet(maskLayout));
+}
+
+/// Propagate the layout of the descriptor to the vector offset operand in
+/// CreateDescOp.
+void LayoutInfoPropagation::visitCreateDescOp(
+    xegpu::CreateDescOp createDesc, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  LayoutInfo descLayout = results[0]->getValue();
+  // Need the layout of the descriptor to propagate to the operands.
+  if (!descLayout.isAssigned())
+    return;
+  // For offset operand propagate 1D default layout.
+  LayoutInfo layout = getDefaultLayoutInfo(1);
+  propagateIfChanged(operands[1], operands[1]->meet(layout));
+}
+
+/// Set the layout for the value, tensor descriptor, and mask operands in the
+/// StoreScatterOp.
+void LayoutInfoPropagation::visitStoreScatterOp(
+    xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Currently, for 2D StoreScatterOp we expect that the height dimension of
+  // the tensor descriptor is equal to the subgroup size. This is ensured by
+  // the op verifier.
+  ArrayRef<int64_t> tdescShape = storeScatter.getTensorDescType().getShape();
+  if (tdescShape.size() > 1)
+    assert(
+        tdescShape[0] == subgroupSize &&
+        "Expected the first dimension of 2D tensor descriptor to be equal to "
+        "subgroup size.");
+
+  LayoutInfo valueLayout = getDefaultLayoutInfo(storeScatter.getValueType());
+  LayoutInfo storeScatterLayout = valueLayout;
+  if (storeScatter.getTranspose()) {
+    // StoreScatterOp allows transpose effect. However, at the stage of this
+    // analysis this effect is not expected and should be abstracted away. Emit
+    // a warning.
+    storeScatter.emitWarning("Transpose effect is not expected for "
+                             "StoreScatterOp at LayoutInfoPropagation stage.");
+    storeScatterLayout = valueLayout.getTransposedLayout({1, 0});
+  }
+  // Propagate the value layout.
+  propagateIfChanged(operands[0], operands[0]->meet(valueLayout));
+  // Propagate the tensor descriptor layout.
+  propagateIfChanged(operands[1], operands[1]->meet(storeScatterLayout));
+  // Use default 1D layout for mask operand.
+  LayoutInfo maskLayout = getDefaultLayoutInfo(1);
+  propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
+}
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// RunLayoutInfoPropagation
+//===----------------------------------------------------------------------===//
+
+/// Driver class for running the LayoutInfoPropagation analysis.
+class RunLayoutInfoPropagation {
+public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation)
+
+  RunLayoutInfoPropagation(Operation *op) : target(op) {
+    SymbolTableCollection symbolTable;
+    solver.load<DeadCodeAnalysis>();
+    solver.load<SparseConstantPropagation>();
+    solver.load<LayoutInfoPropagation>(symbolTable);
+    (void)solver.initializeAndRun(op);
+  }
+
+  LayoutInfo getLayoutInfo(Value val);
+
+  void printAnalysisResult(llvm::raw_ostream &os);
+
+private:
+  DataFlowSolver solver;
+  const Operation *target;
+};
+} // namespace
+
+LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) {
+  auto *state = solver.lookupState<LayoutInfoLattice>(val);
+  if (!state)
+    return {};
+  return state->getValue();
+}
+
+// Print the analysis result for debugging purposes.
+[[maybe_unused]] void
+RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
+  auto printFunctionResult = [&](FunctionOpInterface funcOp) {
+    os << "function: " << funcOp.getName() << ":\n";
+    // Function arguments
+    for (BlockArgument arg : funcOp.getArguments()) {
+      LayoutInfo layout = getLayoutInfo(arg);
+      os << "argument: " << arg << "\n";
+      os << "layout  : ";
+      layout.print(os);
+      os << "\n";
+    }
+    // Function ops
+    funcOp.walk([&](Operation *op) {
+      // Skip ops that do not have results
+      if (op->getResults().empty())
+        return;
+      os << "op    : ";
+      // For control-flow ops, print the op name only.
+      if (isa<BranchOpInterface>(op) || isa<RegionBranchOpInterface>(op))
+        os << op->getName();
+      else
+        op->print(os);
+      os << "\n";
+      // Print the layout for each result.
+      for (auto [i, r] : llvm::enumerate(op->getResults())) {
+        LayoutInfo layout = getLayoutInfo(r);
+        os << "layout for result #" << i << ": ";
+        layout.print(os);
+        os << "\n";
+      }
+    });
+  };
+
+  SmallVector<FunctionOpInterface> funcOps;
+  if (auto modOp = dyn_cast<ModuleOp>(target)) {
+    for (auto funcOp : modOp.getOps<FunctionOpInterface>()) {
+      funcOps.push_back(funcOp);
+    }
+    // Collect all GpuFuncOps in the module.
+    for (auto gpuModOp : modOp.getOps<gpu::GPUModuleOp>()) {
+      for (auto gpuFuncOp : gpuModOp.getOps<FunctionOpInterface>()) {
+        funcOps.push_back(gpuFuncOp);
+      }
+    }
+  }
+  // Print the analysis result for each function.
+  for (FunctionOpInterface funcOp : funcOps) {
+    printFunctionResult(funcOp);
+  }
+}
+
+using GetLayoutCallbackFnTy = function_ref<xegpu::LayoutAttr(Value)>;
+static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
+                     GetLayoutCallbackFnTy getLayoutOfValue) {
+
+  // Iterate over all the results.
+  for (OpResult result : op->getResults()) {
+    Type resultType = result.getType();
+    // Layouts are needed only for vector and tensor descriptor types.
+    if (!isa<VectorType, xegpu::TensorDescType>(resultType))
+      continue;
+    // If the result has any users, we expect it to have a layout.
+    xegpu::LayoutAttr layout = getLayoutOfValue(result);
+    if (!layout && result.getNumUses() > 0) {
+      LLVM_DEBUG(DBGS() << "Expecting layout for result: " << result
+                        << " but got none.\n");
+      continue;
+    }
+    if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
+      // TODO: Handle error.
+      auto typeWithLayout = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
+      result.setType(typeWithLayout);
+      continue;
+    }
+    // If the result is a vector type, add a temporary layout attribute to the
+    // op.
+    std::string resultLayoutName =
+        resultLayoutNamePrefix + std::to_string(result.getResultNumber());
+    op->setAttr(resultLayoutName, layout);
+    // Update all users of the result with the layout.
+    for (OpOperand &user : result.getUses()) {
+      Operation *owner = user.getOwner();
+      unsigned operandNumber = user.getOperandNumber();
+      // Add temporary layout attribute at the user op.
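+      // These operand-side attributes mirror the result-side attribute above;
+      // they are meant to be consumed (and eventually dropped) by later
+      // passes such as SIMT distribution.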
+      std::string attrName =
+          operandLayoutNamePrefix + std::to_string(operandNumber);
+      owner->setAttr(attrName, layout);
+    }
+  }
+}
+static void updateBranchTerminatorOpInterface(
+    mlir::OpBuilder &builder,
+    mlir::RegionBranchTerminatorOpInterface terminator,
+    GetLayoutCallbackFnTy getLayoutOfValue) {
+  if (!mlir::isa<mlir::RegionBranchOpInterface>(terminator->getParentOp()))
+    return;
+
+  llvm::SmallVector<mlir::RegionSuccessor> successors;
+  llvm::SmallVector<mlir::Attribute> operands(terminator->getNumOperands(),
+                                              nullptr);
+  terminator.getSuccessorRegions(operands, successors);
+
+  for (mlir::RegionSuccessor &successor : successors) {
+    if (!successor.isParent())
+      continue;
+
+    mlir::OperandRange operands = terminator.getSuccessorOperands(successor);
+    mlir::ValueRange inputs = successor.getSuccessorInputs();
+    for (auto [operand, input] : llvm::zip(operands, inputs)) {
+      Type inputType = input.getType();
+      if (!isa<xegpu::TensorDescType>(inputType))
+        continue;
+      xegpu::LayoutAttr inputLayout = getLayoutOfValue(input);
+      xegpu::LayoutAttr operandLayout = getLayoutOfValue(operand);
+
+      if (!operandLayout) {
+        LLVM_DEBUG(DBGS() << "Expecting layout for region successor operand : "
+                          << operand << " but got none.\n");
+        continue;
+      }
+
+      if (inputLayout && inputLayout != operandLayout) {
+        LLVM_DEBUG(
+            DBGS()
+            << "Conflicting layouts for region successor operand and input: "
+            << inputLayout << " vs " << operandLayout << "\n");
+        continue;
+      }
+      // Get tensor descriptor type with the layout.
+      auto tdescTy = dyn_cast<xegpu::TensorDescType>(inputType);
+      auto newTdescTy = xegpu::TensorDescType::get(
+          tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(),
+          tdescTy.getEncoding(), operandLayout);
+      input.setType(newTdescTy);
+    }
+  }
+}
+static void updateBranchOpInterface(mlir::OpBuilder &builder,
+                                    mlir::RegionBranchOpInterface branch,
+                                    GetLayoutCallbackFnTy getLayoutOfValue) {
+  mlir::Operation *op = branch.getOperation();
+  llvm::SmallVector<mlir::RegionSuccessor> successors;
+  llvm::SmallVector<mlir::Attribute> operands(op->getNumOperands(), nullptr);
+  branch.getEntrySuccessorRegions(operands, successors);
+  DenseMap<Value, xegpu::LayoutAttr> resultToLayouts;
+  mlir::ValueRange results = op->getResults();
+
+  for (mlir::RegionSuccessor &successor : successors) {
+    if (successor.isParent())
+      continue;
+
+    mlir::OperandRange operands = branch.getEntrySuccessorOperands(successor);
+    mlir::ValueRange inputs = successor.getSuccessorInputs();
+
+    for (auto [operand, input, result] : llvm::zip(operands, inputs, results)) {
+      Type inputType = input.getType();
+      if (!isa<xegpu::TensorDescType>(inputType))
+        continue;
+      xegpu::LayoutAttr blockArgLayout = getLayoutOfValue(input);
+      xegpu::LayoutAttr initArgLayout = getLayoutOfValue(operand);
+
+      if (!blockArgLayout || !initArgLayout) {
+        LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << input
+                          << " or init arg: " << operand << "\n");
+        continue;
+      }
+
+      // TODO: We expect these two to match. Data flow analysis will ensure
+      // this.
+      assert(blockArgLayout == initArgLayout &&
+             "Expecting block arg and init arg to have the same layout.");
+      // Get tensor descriptor type with the layout.
+      auto tdescTy = dyn_cast<xegpu::TensorDescType>(inputType);
+      auto newTdescTy = xegpu::TensorDescType::get(
+          tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(),
+          tdescTy.getEncoding(), blockArgLayout);
+      input.setType(newTdescTy);
+      // Store the layout for the result.
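+      // If two region successors disagree on the layout of the same result,
+      // the first recorded layout wins and the conflict is only reported in
+      // debug output (see below).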
+      if (resultToLayouts.count(result) != 0 &&
+          resultToLayouts[result] != blockArgLayout) {
+        LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result
+                          << " - " << resultToLayouts[result] << " vs "
+                          << blockArgLayout << "\n");
+      } else {
+        resultToLayouts[result] = blockArgLayout;
+      }
+    }
+  }
+  for (auto [i, r] : llvm::enumerate(op->getResults())) {
+    Type resultType = r.getType();
+    if (!isa<VectorType, xegpu::TensorDescType>(resultType))
+      continue;
+    xegpu::LayoutAttr layout = getLayoutOfValue(r);
+    if (!layout)
+      layout = resultToLayouts[r];
+    if (!layout) {
+      LLVM_DEBUG(DBGS() << "No layout assigned for vector/tensor desc result:"
+                        << r << "\n");
+      continue;
+    }
+    if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
+      auto newTdescTy = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
+      r.setType(newTdescTy);
+      continue;
+    }
+    // If the result is a vector type, add a temporary layout attribute to
+    // the op.
+    std::string resultLayoutName =
+        resultLayoutNamePrefix + std::to_string(r.getResultNumber());
+    op->setAttr(resultLayoutName, layout);
+    // Update all users of the result with the layout.
+    for (OpOperand &user : r.getUses()) {
+      Operation *owner = user.getOwner();
+      unsigned operandNumber = user.getOperandNumber();
+      // Add temporary layout attribute at the user op.
+      std::string attrName =
+          operandLayoutNamePrefix + std::to_string(operandNumber);
+      owner->setAttr(attrName, layout);
+    }
+  }
+}
+
+namespace {
+
+struct XeGPULayoutPropagatePass final
+    : public xegpu::impl::XeGPULayoutPropagateBase<XeGPULayoutPropagatePass> {
+  void runOnOperation() override;
+};
+
+} // namespace
+
+void XeGPULayoutPropagatePass::runOnOperation() {
+  auto &analysis = getAnalysis<RunLayoutInfoPropagation>();
+
+  auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr {
+    LayoutInfo layout = analysis.getLayoutInfo(val);
+    if (!layout.isAssigned()) {
+      return {};
+    }
+    SmallVector<int> laneLayout, laneData;
+    for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
+                                               layout.getDataAsArrayRef())) {
+      laneLayout.push_back(static_cast<int>(layout));
+      laneData.push_back(static_cast<int>(data));
+    }
+    return xegpu::LayoutAttr::get(val.getContext(), laneLayout, laneData);
+  };
+
+  mlir::OpBuilder builder(&getContext());
+  Operation *op = getOperation();
+  op->walk([&](mlir::Block *block) {
+    for (mlir::Operation &op : llvm::reverse(block->getOperations())) {
+      if (auto terminator =
+              mlir::dyn_cast<mlir::RegionBranchTerminatorOpInterface>(op)) {
+        updateBranchTerminatorOpInterface(builder, terminator,
+                                          getXeGPULayoutForValue);
+        continue;
+      }
+
+      if (auto iface = mlir::dyn_cast<mlir::RegionBranchOpInterface>(op)) {
+        updateBranchOpInterface(builder, iface, getXeGPULayoutForValue);
+        continue;
+      }
+      updateOp(builder, &op, getXeGPULayoutForValue);
+    }
+  });
+}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index a17c8d8a4f3f3..2df8701ed3b31 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -57,7 +57,6 @@ namespace xegpu {
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
 
 using namespace mlir;
-using namespace mlir::dataflow;
 
 /// HW dependent constants.
 /// TODO: These constants should be queried from the target information.
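Reviewer note on the temporary attribute scheme used by updateOp and
updateBranchOpInterface above: vector-typed results get a discardable
`layout_result_<i>` attribute on the defining op, and every user gets a
matching `layout_operand_<i>` attribute (both names are visible in the updated
tests further below). A minimal sketch of how a downstream pass might read one
of these attributes back, assuming only that naming convention;
`getAttachedOperandLayout` is a hypothetical helper and not part of this patch:

    // Hypothetical helper, not in this patch: recover the layout that the
    // propagation pass attached for operand `idx` of `op`, or null if none.
    static xegpu::LayoutAttr getAttachedOperandLayout(mlir::Operation *op,
                                                      unsigned idx) {
      std::string name = "layout_operand_" + std::to_string(idx);
      return op->getAttrOfType<xegpu::LayoutAttr>(name);
    }

A consumer would typically call this with `use.getOperandNumber()` while
rewriting types, then erase the attribute once the layout has been
materialized in the type.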
@@ -79,1017 +78,6 @@ static const char *const resolveSIMTTypeMismatch = namespace { -//===----------------------------------------------------------------------===// -// Layout -//===----------------------------------------------------------------------===// - -/// Helper class to store the ND layout of lanes within a subgroup and data -/// owned by each lane. -struct Layout { - SmallVector layout; - Layout() = default; - Layout(std::initializer_list list) : layout(list) {} - void print(llvm::raw_ostream &os) const; - size_t size() const { return layout.size(); } - int64_t operator[](size_t idx) const; -}; - -void Layout::print(llvm::raw_ostream &os) const { - os << llvm::interleaved_array(layout); -} - -int64_t Layout::operator[](size_t idx) const { - assert(idx < layout.size() && "Index out of bounds."); - return layout[idx]; -} - -/// LaneLayout represents the logical layout of lanes within a subgroup when it -/// accesses some value. LaneData represents the logical layout of data owned by -/// each work item. -using LaneLayout = Layout; -using LaneData = Layout; - -//===----------------------------------------------------------------------===// -// LayoutInfo -//===----------------------------------------------------------------------===// - -/// Helper class for tracking the analysis state of an mlir value. For layout -/// propagation, the analysis state is simply the lane_layout and lane_data of -/// each value. Purpose of this analysis to propagate some unique layout for -/// each value in the program starting from a set of anchor operations (like -/// DPAS, StoreNd, etc.). -/// -/// Given this, LayoutInfo satisifies the following properties: -/// 1) A LayoutInfo value can be in one of two states - `assigned` or `not -/// assigned`. -/// 2) Two LayoutInfo values are equal if they are both assigned or -/// both not assigned. The concrete value of assigned state does not matter. -/// 3) The meet operator works as follows: -/// - If current state is assigned, return the current state. (already -/// a unique layout is assigned. don't change it) -/// - Otherwise, return the other state. - -struct LayoutInfo { -private: - LaneLayout laneLayout; - LaneData laneData; - -public: - LayoutInfo() = default; - LayoutInfo(const LaneLayout &layout, const LaneData &data) - : laneLayout(layout), laneData(data) {} - - // Two lattice values are equal if they have `some` layout. The actual - // content of the layout does not matter. 
- bool operator==(const LayoutInfo &other) const { - return this->isAssigned() == other.isAssigned(); - } - - static LayoutInfo meet(const LayoutInfo &lhs, const LayoutInfo &rhs); - - static LayoutInfo join(const LayoutInfo &lhs, const LayoutInfo &rhs); - - void print(raw_ostream &os) const; - - bool isAssigned() const { - return laneLayout.size() > 0 && laneData.size() > 0; - } - - LayoutInfo getTransposedLayout(ArrayRef permutation) const; - - const LaneLayout &getLayout() const { return laneLayout; } - const LaneData &getData() const { return laneData; } - ArrayRef getLayoutAsArrayRef() const { return laneLayout.layout; } - ArrayRef getDataAsArrayRef() const { return laneData.layout; } -}; - -void LayoutInfo::print(raw_ostream &os) const { - if (isAssigned()) { - os << "lane_layout: "; - laneLayout.print(os); - os << ", lane_data: "; - laneData.print(os); - } else { - os << "Not assigned."; - } -} - -LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) { - if (!lhs.isAssigned()) - return rhs; - return lhs; -} - -/// Since this is a backward analysis, join method is not used. -LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) { - llvm_unreachable("Join should not be triggered by layout propagation."); -} - -/// Get the transposed layout according to the given permutation. -LayoutInfo -LayoutInfo::getTransposedLayout(ArrayRef permutation) const { - if (!isAssigned()) - return {}; - LaneLayout newLayout; - LaneData newData; - for (int64_t idx : permutation) { - newLayout.layout.push_back(laneLayout.layout[idx]); - newData.layout.push_back(laneData.layout[idx]); - } - return LayoutInfo(newLayout, newData); -} - -//===----------------------------------------------------------------------===// -// LayoutInfoLattice -//===----------------------------------------------------------------------===// - -/// Lattice holding the LayoutInfo for each value. -struct LayoutInfoLattice : public Lattice { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LayoutInfoLattice) - using Lattice::Lattice; -}; - -/// Helper Functions to get default layouts. A `default layout` is a layout that -/// is assigned to a value when the layout is not fixed by some anchor operation -/// (like DPAS). - -/// Helper Function to get the default layout for uniform values like constants. -/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1]. -/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1]. -static LayoutInfo getDefaultLayoutInfo(unsigned rank) { - assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); - if (rank == 1) - return LayoutInfo(LaneLayout({subgroupSize}), LaneData({1})); - return LayoutInfo(LaneLayout({1, subgroupSize}), LaneData({1, 1})); -} - -/// Helper to get the default layout for a vector type. -static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) { - // Expecting a 1D or 2D vector. - assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) && - "Expected 1D or 2D vector."); - // Expecting int or float element type. - assert(vectorTy.getElementType().isIntOrFloat() && - "Expected int or float element type."); - // If the rank is 1, then return default layout for 1D vector. - if (vectorTy.getRank() == 1) - return getDefaultLayoutInfo(1); - // Packing factor is determined by the element type bitwidth. 
- int packingFactor = 1; - unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); - if (bitwidth < packedSizeInBitsForDefault) - packingFactor = packedSizeInBitsForDefault / bitwidth; - return LayoutInfo(LaneLayout({1, subgroupSize}), - LaneData({1, packingFactor})); -} - -/// Helper Function to get the expected layouts for DPAS operands. `lane_data` -/// is set according to the following criteria: -/// * For A operand, the data must be packed in minimum -/// `packedSizeInBitsForDefault` -/// * For B operand, the data must be packed in minimum -/// `packedSizeInBitsForDpasB` -static LayoutInfo getLayoutInfoForDPASOperand(VectorType vectorTy, - unsigned operandNum) { - Type elementTy = vectorTy.getElementType(); - assert(elementTy.isIntOrFloat() && - "Expected int or float type in DPAS operands"); - LaneLayout layout({1, subgroupSize}); - // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and - // must have the VNNI format. - if (operandNum == 1 && - elementTy.getIntOrFloatBitWidth() < packedSizeInBitsForDpasB) { - LaneData data( - {packedSizeInBitsForDpasB / elementTy.getIntOrFloatBitWidth(), 1}); - return LayoutInfo(layout, data); - } - // Otherwise, return the default layout for the vector type. - return getDefaultLayoutInfo(vectorTy); -} - -//===----------------------------------------------------------------------===// -// LayoutInfoPropagation -//===----------------------------------------------------------------------===// - -/// Backward data flow analysis to propagate the lane_layout and lane_data of -/// each value in the program. Currently, the layouts for operands DPAS, -/// StoreNd, and StoreScatter are fixed (known before propagation). Purpose of -/// this analysis is to propagate those known layouts to all their producers and -/// (other) consumers. 
-class LayoutInfoPropagation - : public SparseBackwardDataFlowAnalysis { -private: - void visitDpasOp(xegpu::DpasOp dpas, ArrayRef operands, - ArrayRef results); - - void visitStoreNdOp(xegpu::StoreNdOp store, - ArrayRef operands, - ArrayRef results); - - void visitStoreScatterOp(xegpu::StoreScatterOp storeScatter, - ArrayRef operands, - ArrayRef results); - - void visitLoadNdOp(xegpu::LoadNdOp load, - ArrayRef operands, - ArrayRef results); - - void visitLoadGatherOp(xegpu::LoadGatherOp load, - ArrayRef operands, - ArrayRef results); - - void visitTransposeOp(vector::TransposeOp transpose, - ArrayRef operands, - ArrayRef results); - - void visitVectorBitcastOp(vector::BitCastOp bitcast, - ArrayRef operands, - ArrayRef results); - - void visitCreateDescOp(xegpu::CreateDescOp createDesc, - ArrayRef operands, - ArrayRef results); - - void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset, - ArrayRef operands, - ArrayRef results); - - void visitPrefetchNdOp(xegpu::PrefetchNdOp prefetch, - ArrayRef operands, - ArrayRef results); - - void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction, - ArrayRef operands, - ArrayRef results); - -public: - LayoutInfoPropagation(DataFlowSolver &solver, - SymbolTableCollection &symbolTable) - : SparseBackwardDataFlowAnalysis(solver, symbolTable) {} - using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis; - - LogicalResult - visitOperation(Operation *op, ArrayRef operands, - ArrayRef results) override; - - void visitBranchOperand(OpOperand &operand) override {}; - - void visitCallOperand(OpOperand &operand) override {}; - - void visitExternalCall(CallOpInterface call, - ArrayRef operands, - ArrayRef results) override { - }; - - void setToExitState(LayoutInfoLattice *lattice) override { - (void)lattice->meet(LayoutInfo()); - } -}; -} // namespace - -LogicalResult LayoutInfoPropagation::visitOperation( - Operation *op, ArrayRef operands, - ArrayRef results) { - TypeSwitch(op) - .Case( - [&](auto dpasOp) { visitDpasOp(dpasOp, operands, results); }) - .Case( - [&](auto storeNdOp) { visitStoreNdOp(storeNdOp, operands, results); }) - .Case([&](auto storeScatterOp) { - visitStoreScatterOp(storeScatterOp, operands, results); - }) - .Case( - [&](auto loadNdOp) { visitLoadNdOp(loadNdOp, operands, results); }) - .Case([&](auto loadGatherOp) { - visitLoadGatherOp(loadGatherOp, operands, results); - }) - .Case([&](auto createDescOp) { - visitCreateDescOp(createDescOp, operands, results); - }) - .Case([&](auto updateNdOffsetOp) { - visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results); - }) - .Case([&](auto prefetchNdOp) { - visitPrefetchNdOp(prefetchNdOp, operands, results); - }) - // No need to propagate the layout to operands in CreateNdDescOp because - // they are scalars (offsets, sizes, etc.). - .Case([&](auto createNdDescOp) {}) - .Case([&](auto transposeOp) { - visitTransposeOp(transposeOp, operands, results); - }) - .Case([&](auto bitcastOp) { - visitVectorBitcastOp(bitcastOp, operands, results); - }) - .Case([&](auto reductionOp) { - visitVectorMultiReductionOp(reductionOp, operands, results); - }) - // All other ops. - .Default([&](Operation *op) { - for (const LayoutInfoLattice *r : results) { - for (LayoutInfoLattice *operand : operands) { - // Propagate the layout of the result to the operand. - if (r->getValue().isAssigned()) - meet(operand, *r); - } - } - }); - // Add a dependency from each result to program point after the operation. 
- for (const LayoutInfoLattice *r : results) { - addDependency(const_cast(r), getProgramPointAfter(op)); - } - return success(); -} - -void LayoutInfoPropagation::visitPrefetchNdOp( - xegpu::PrefetchNdOp prefetch, ArrayRef operands, - ArrayRef results) { - // Here we assign the default layout to the tensor descriptor operand of - // prefetch. - auto tdescTy = prefetch.getTensorDescType(); - auto prefetchLayout = getDefaultLayoutInfo( - VectorType::get(tdescTy.getShape(), tdescTy.getElementType())); - // Propagate the layout to the source tensor descriptor. - propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout)); -} - -void LayoutInfoPropagation::visitVectorMultiReductionOp( - vector::MultiDimReductionOp reduction, - ArrayRef operands, - ArrayRef results) { - // The layout of the result must be present. - LayoutInfo resultLayout = results[0]->getValue(); - if (!resultLayout.isAssigned()) - return; - // We only consider 2D -> 1D reductions at this point. - assert(resultLayout.getLayout().size() == 1 && - "Expected 1D layout for reduction result."); - // Given that the result is 1D, the layout of the operand should be 2D with - // default layout. - LayoutInfo operandLayout = getDefaultLayoutInfo(2); - propagateIfChanged(operands[0], operands[0]->meet(operandLayout)); - // Accumulator should have the same layout as the result. - propagateIfChanged(operands[1], operands[1]->meet(resultLayout)); -} - -/// Propagate the layout of the result tensor to the source tensor descriptor in -/// UpdateNdOffsetOp. -void LayoutInfoPropagation::visitUpdateNdOffsetOp( - xegpu::UpdateNdOffsetOp updateNdOffset, - ArrayRef operands, - ArrayRef results) { - // The layout of the result must be present. - LayoutInfo resultLayout = results[0]->getValue(); - if (!resultLayout.isAssigned()) - return; - // Propagate the layout to the source operand. - propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); -} - -/// Set the layouts for DPAS A, B, and C operands. -void LayoutInfoPropagation::visitDpasOp( - xegpu::DpasOp dpas, ArrayRef operands, - ArrayRef results) { - VectorType aTy = dpas.getLhsType(); - VectorType bTy = dpas.getRhsType(); - propagateIfChanged(operands[0], - operands[0]->meet(getLayoutInfoForDPASOperand(aTy, 0))); - propagateIfChanged(operands[1], - operands[1]->meet(getLayoutInfoForDPASOperand(bTy, 1))); - if (operands.size() > 2) { - VectorType cTy = dpas.getAccType(); - propagateIfChanged(operands[2], - operands[2]->meet(getLayoutInfoForDPASOperand(cTy, 2))); - } -} - -/// Set the layout for the value and tensor descriptor operands in StoreNdOp. -void LayoutInfoPropagation::visitStoreNdOp( - xegpu::StoreNdOp store, ArrayRef operands, - ArrayRef results) { - LayoutInfo storeLayout = getDefaultLayoutInfo(store.getValueType()); - // Both operands should have the same layout - for (LayoutInfoLattice *operand : operands) { - propagateIfChanged(operand, operand->meet(storeLayout)); - } -} - -/// Propagate the layout of the value to the tensor descriptor operand in -/// LoadNdOp. -void LayoutInfoPropagation::visitLoadNdOp( - xegpu::LoadNdOp load, ArrayRef operands, - ArrayRef results) { - LayoutInfo valueLayout = results[0]->getValue(); - // Need the layout of the value to propagate to the tensor descriptor. - if (!valueLayout.isAssigned()) - return; - LayoutInfo tensorDescLayout = valueLayout; - // LoadNdOp has the transpose effect. However, at the stage of this analysis - // this effect is not expected and should be abstracted away. Emit a warning. 
- if (auto transpose = load.getTranspose()) { - load.emitWarning("Transpose effect is not expected for LoadNdOp at " - "LayoutInfoPropagation stage."); - tensorDescLayout = valueLayout.getTransposedLayout(transpose.value()); - } - // Propagate the new layout to the tensor descriptor operand. - propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout)); -} - -/// For vector::TransposeOp, the layout of the result is transposed and -/// propagated to the operand. -void LayoutInfoPropagation::visitTransposeOp( - vector::TransposeOp transpose, ArrayRef operands, - ArrayRef results) { - // Need the layout of transpose result to propagate to the operands. - LayoutInfo resultLayout = results[0]->getValue(); - if (!resultLayout.isAssigned()) - return; - LayoutInfo newLayout = - resultLayout.getTransposedLayout(transpose.getPermutation()); - // Propagate the new layout to the vector operand. - propagateIfChanged(operands[0], operands[0]->meet(newLayout)); -} - -/// For vector::BitCastOp, the lane_data of the source layout is changed based -/// on the bit width of the source and result types. -void LayoutInfoPropagation::visitVectorBitcastOp( - vector::BitCastOp bitcast, ArrayRef operands, - ArrayRef results) { - // Need the layout of bitcast result to propagate to the operands. - LayoutInfo resultLayout = results[0]->getValue(); - if (!resultLayout.isAssigned()) - return; - int inElemTyBitWidth = - bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth(); - int outElemTyBitWidth = - bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth(); - - // LaneLayout does not change. - const LaneLayout &newLaneLayout = resultLayout.getLayout(); - const LaneData &currData = resultLayout.getData(); - LaneData newLaneData; - // It's a widening bitcast - if (inElemTyBitWidth < outElemTyBitWidth) { - int ratio = outElemTyBitWidth / inElemTyBitWidth; - newLaneData = resultLayout.getData()[0] == 1 - ? LaneData({1, currData[1] * ratio}) - : LaneData({currData[0] * ratio, 1}); - } else { - // It's a narrowing bitcast - int ratio = inElemTyBitWidth / outElemTyBitWidth; - newLaneData = resultLayout.getData()[0] == 1 - ? LaneData({1, currData[1] / ratio}) - : LaneData({currData[0] / ratio, 1}); - } - - propagateIfChanged(operands[0], - operands[0]->meet(LayoutInfo(newLaneLayout, newLaneData))); -} - -/// Propagate the layout of the result to the tensor descriptor and mask -/// operands in LoadGatherOp. -void LayoutInfoPropagation::visitLoadGatherOp( - xegpu::LoadGatherOp load, ArrayRef operands, - ArrayRef results) { - LayoutInfo valueLayout = results[0]->getValue(); - // Need the layout of the value to propagate to the tensor descriptor. - if (!valueLayout.isAssigned()) - return; - - LayoutInfo tensorDescLayout = valueLayout; - if (load.getTranspose()) { - // LoadGatherOp has the transpose effect. However, at the stage of this - // analyis this effect is not expected and should be abstracted away. Emit - // a warning. - load.emitWarning("Transpose effect is not expected for LoadGatherOp at " - "LayoutInfoPropagation stage."); - tensorDescLayout = valueLayout.getTransposedLayout({1, 0}); - } - // Mask operand should have 1D default layout. - LayoutInfo maskLayout = getDefaultLayoutInfo(1); - // Propagate the new layout to the tensor descriptor operand. - propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout)); - // Propagate the new layout to the mask operand. 
- propagateIfChanged(operands[1], operands[1]->meet(maskLayout)); -} - -/// Propagate the layout of the descriptor to the vector offset operand in -/// CreateDescOp. -void LayoutInfoPropagation::visitCreateDescOp( - xegpu::CreateDescOp createDesc, ArrayRef operands, - ArrayRef results) { - LayoutInfo descLayout = results[0]->getValue(); - // Need the layout of the descriptor to propagate to the operands. - if (!descLayout.isAssigned()) - return; - // For offset operand propagate 1D default layout. - LayoutInfo layout = getDefaultLayoutInfo(1); - propagateIfChanged(operands[1], operands[1]->meet(layout)); -} - -/// Set the layout for the value, tensor descriptor, and mask operands in the -/// StoreScatterOp. -void LayoutInfoPropagation::visitStoreScatterOp( - xegpu::StoreScatterOp storeScatter, ArrayRef operands, - ArrayRef results) { - // Currently, for 2D StoreScatterOp we expect that the height dimension of - // the tensor descriptor is equal to the subgroup size. This is ensured by - // the op verifier. - ArrayRef tdescShape = storeScatter.getTensorDescType().getShape(); - if (tdescShape.size() > 1) - assert( - tdescShape[0] == subgroupSize && - "Expected the first dimension of 2D tensor descriptor to be equal to " - "subgroup size."); - - LayoutInfo valueLayout = getDefaultLayoutInfo(storeScatter.getValueType()); - LayoutInfo storeScatterLayout = valueLayout; - if (storeScatter.getTranspose()) { - // StoreScatteOp allows transpose effect. However, at the stage of this - // analyis this effect is not expected and should be abstracted away. Emit - // a warning. - storeScatter.emitWarning("Transpose effect is not expected for " - "StoreScatterOp at LayoutInfoPropagation stage."); - storeScatterLayout = valueLayout.getTransposedLayout({1, 0}); - } - // Propagate the value layout. - propagateIfChanged(operands[0], operands[0]->meet(valueLayout)); - // Propagate the tensor descriptor layout. - propagateIfChanged(operands[1], operands[1]->meet(storeScatterLayout)); - // Use default 1D layout for mask operand. - LayoutInfo maskLayout = getDefaultLayoutInfo(1); - propagateIfChanged(operands[2], operands[2]->meet(maskLayout)); -} - -namespace { - -//===----------------------------------------------------------------------===// -// RunLayoutInfoPropagation -//===----------------------------------------------------------------------===// - -/// Driver class for running the LayoutInfoPropagation analysis. 
-class RunLayoutInfoPropagation { -public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation) - - RunLayoutInfoPropagation(Operation *op) : target(op) { - SymbolTableCollection symbolTable; - solver.load(); - solver.load(); - solver.load(symbolTable); - (void)solver.initializeAndRun(op); - } - - LayoutInfo getLayoutInfo(Value val); - - void printAnalysisResult(llvm::raw_ostream &os); - -private: - DataFlowSolver solver; - const Operation *target; -}; -} // namespace - -LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) { - auto *state = solver.lookupState(val); - if (!state) - return {}; - return state->getValue(); -} - -void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) { - auto printFunctionResult = [&](FunctionOpInterface funcOp) { - os << "function: " << funcOp.getName() << ":\n"; - // Function arguments - for (BlockArgument arg : funcOp.getArguments()) { - LayoutInfo layout = getLayoutInfo(arg); - os << "argument: " << arg << "\n"; - os << "layout : "; - layout.print(os); - os << "\n"; - } - // Function ops - funcOp.walk([&](Operation *op) { - // Skip ops that do not have results - if (op->getResults().empty()) - return; - os << "op : "; - // For control-flow ops, print the op name only. - if (isa(op) || isa(op)) - os << op->getName(); - else - op->print(os); - os << "\n"; - // Print the layout for each result. - for (auto [i, r] : llvm::enumerate(op->getResults())) { - LayoutInfo layout = getLayoutInfo(r); - os << "layout for result #" << i << ": "; - layout.print(os); - os << "\n"; - } - }); - }; - - SmallVector funcOps; - if (auto modOp = dyn_cast(target)) { - for (auto funcOp : modOp.getOps()) { - funcOps.push_back(funcOp); - } - // Collect all GpuFuncOps in the module. - for (auto gpuModOp : modOp.getOps()) { - for (auto gpuFuncOp : gpuModOp.getOps()) { - funcOps.push_back(gpuFuncOp); - } - } - } - // Print the analysis result for each function. - for (FunctionOpInterface funcOp : funcOps) { - printFunctionResult(funcOp); - } -} - -// namespace { - -//===----------------------------------------------------------------------===// -// LayoutAttrAssignment -//===----------------------------------------------------------------------===// -// template -// class UpdateTensorDescType : public OpConversionPattern { -// public: -// UpdateTensorDescType(MLIRContext *context, -// function_ref -// getLayoutOfValue, TypeConverter &typeConverter, -// PatternBenefit benefit = 1) -// : OpConversionPattern(typeConverter, context, benefit), -// getLayoutOfValue(getLayoutOfValue) {} -// using OpConversionPattern::OpConversionPattern; -// LogicalResult -// matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor, -// ConversionPatternRewriter &rewriter) const override { -// // Op must have single result. -// if (op->getNumResults() != 1) -// return failure(); -// Type resultType = op->getResult(0).getType(); -// // Result type must be a tensor descriptor type. -// if (!isa(resultType)) { -// LLVM_DEBUG(DBGS() << "Result type is not a tensor descriptor type: " -// << resultType << "\n"); -// return failure(); -// } -// auto assignedLayout = getLayoutOfValue(op.getResult()); -// if (!assignedLayout) { -// LLVM_DEBUG(DBGS() << "No layout assigned for " << *op << "\n"); -// return failure(); -// } -// // Get the original tensor descriptor type. 
-// auto origTensorDescTy = dyn_cast(resultType); -// auto newTensorDescTy = xegpu::TensorDescType::get( -// origTensorDescTy.getContext(), origTensorDescTy.getShape(), -// origTensorDescTy.getElementType(), origTensorDescTy.getEncoding(), -// assignedLayout); -// rewriter.replaceOpWithNewOp(op, newTensorDescTy, -// adaptor.getOperands(), op->getAttrs()); -// return success(); -// } - -// private: -// function_ref getLayoutOfValue; -// }; -// /// This class is responsible for assigning the layout attributes to the ops -// and -// /// their users based on the layout propagation analysis result. -// class LayoutAttrAssignment { -// public: -// LayoutAttrAssignment(Operation *top, -// function_ref getLayout) -// : getAnalysisResult(getLayout), top(top) {} - -// LogicalResult run(); - -// private: -// LogicalResult assign(Operation *op); -// void assignToUsers(Value v, xegpu::LayoutAttr layout); -// xegpu::LayoutAttr getLayoutAttrForValue(Value v); -// LogicalResult resolveConflicts(); -// // Callable to get the layout of a value based on the layout propagation -// // analysis. -// function_ref getAnalysisResult; -// Operation *top; -// }; - -// } // namespace - -// /// Helper to assign the layout attribute to the users of the value. -// void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) { -// for (OpOperand &user : v.getUses()) { -// Operation *owner = user.getOwner(); -// unsigned operandNumber = user.getOperandNumber(); -// // Use a generic name for ease of querying the layout attribute later. -// std::string attrName = -// operandLayoutNamePrefix + std::to_string(operandNumber); -// owner->setAttr(attrName, layout); -// } -// } - -// /// Convert the layout assigned to a value to xegpu::LayoutAttr. -// xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) { -// llvm::errs() << "getLayoutAttrForValue: " << v << "\n"; -// LayoutInfo layout = getAnalysisResult(v); -// if (!layout.isAssigned()) { -// llvm::errs() << "No layout assigned for value\n"; -// return {}; -// } -// SmallVector laneLayout, laneData; -// for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(), -// layout.getDataAsArrayRef())) { -// laneLayout.push_back(static_cast(layout)); -// laneData.push_back(static_cast(data)); -// } -// llvm::errs() << "return layout\n"; -// return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData); -// } - -// /// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned -// /// based on the layout propagation analysis result. -// LogicalResult LayoutAttrAssignment::assign(Operation *op) { -// // For function ops, propagate the function argument layout to the users. -// if (auto func = dyn_cast(op)) { -// for (BlockArgument arg : func.getArguments()) { -// xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(arg); -// if (layoutInfo) { -// assignToUsers(arg, layoutInfo); -// } -// } -// return success(); -// } -// // If no results, move on. -// if (op->getNumResults() == 0) -// return success(); -// // If all the results are scalars, move on. -// if (llvm::all_of(op->getResultTypes(), -// [](Type t) { return t.isIntOrIndexOrFloat(); })) -// return success(); -// // If the op has more than one result and at least one result is a tensor -// // descriptor, exit. This case is not supported yet. -// // TODO: Support this case. 
-// if (op->getNumResults() > 1 && llvm::any_of(op->getResultTypes(), [](Type -// t) { -// return isa(t); -// })) { -// LLVM_DEBUG( -// DBGS() << op->getName() -// << " op has more than one result and at least one is a tensor -// " -// "descriptor. This case is not handled.\n"); -// return failure(); -// } -// // If the result is a tensor descriptor, attach the layout to the tensor -// // descriptor itself. -// if (auto tensorDescTy = -// dyn_cast(op->getResultTypes()[0])) { -// xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(op->getResult(0)); -// if (!layoutInfo) { -// LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n"); -// return failure(); -// } - -// // Clone the op, attach the layout to the result tensor descriptor, and -// // remove the original op. -// OpBuilder builder(op); -// Operation *newOp = builder.clone(*op); -// auto newTensorDescTy = xegpu::TensorDescType::get( -// tensorDescTy.getContext(), tensorDescTy.getShape(), -// tensorDescTy.getElementType(), tensorDescTy.getEncoding(), -// layoutInfo); -// newOp->getResult(0).setType(newTensorDescTy); -// op->replaceAllUsesWith(newOp->getResults()); -// op->erase(); -// return success(); -// } -// // Otherwise simply attach the layout to the op itself. -// for (auto [i, r] : llvm::enumerate(op->getResults())) { -// xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(r); -// if (layoutInfo) { -// std::string attrName = resultLayoutNamePrefix + std::to_string(i); -// op->setAttr(attrName, layoutInfo); -// // Attach the layout attribute to the users of the result. -// assignToUsers(r, layoutInfo); -// } -// } -// return success(); -// } - -// /// Walk the IR and attach xegpu::LayoutAttr to all ops and their users. -// LogicalResult LayoutAttrAssignment::run() { -// // auto walkResult = top->walk([&](Operation *op) { -// // if (failed(assign(op))) -// // return WalkResult::interrupt(); -// // return WalkResult::advance(); -// // }); - -// // if (walkResult.wasInterrupted()) -// // return failure(); -// // apply the UpdateTensorDescType pattern to all ops -// // RewritePatternSet patterns(top->getContext()); -// // patterns.add( -// // top->getContext(), [&](Value v) -> xegpu::LayoutAttr { -// // llvm::errs() << "invoking callback for value\n"; -// // return getLayoutAttrForValue(v); -// // }); -// // if (failed(applyPatternsGreedily(top, std::move(patterns)))) -// // return failure(); - -// return resolveConflicts(); -// } - -// /// TODO: Implement the layout conflict resolution. This must ensure mainly -// two -// /// things: -// /// 1) Is a given layout supported by the op? (need to query the target -// /// HW info). Otherwise can we achieve this layout using a layout -// conversion? -// /// 2) Do all the operands have the required layout? If not, can it -// /// be resolved using a layout conversion? -// LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); } -using GetLayoutCallbackFnTy = function_ref; -static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, - GetLayoutCallbackFnTy getLayoutOfValue) { - - // Iterate over all the results. - for (OpResult result : op->getResults()) { - Type resultType = result.getType(); - // Layouts are needed only for vector and tensor descriptor types. - if (!isa(resultType)) - continue; - // If the result has any users, we expect it to have a layout. 
- xegpu::LayoutAttr layout = getLayoutOfValue(result); - if (!layout && result.getNumUses() > 0) { - LLVM_DEBUG(DBGS() << "Expecting layout for result: " << result - << " but got none.\n"); - continue; - } - if (auto tensorDescTy = dyn_cast(resultType)) { - // TODO: Handle error. - auto typeWithLayout = xegpu::TensorDescType::get( - tensorDescTy.getContext(), tensorDescTy.getShape(), - tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); - result.setType(typeWithLayout); - continue; - } - // If the result is a vector type, add a temporary layout attribute to the - // op. - std::string resultLayoutName = - resultLayoutNamePrefix + std::to_string(result.getResultNumber()); - op->setAttr(resultLayoutName, layout); - // Update all users of the result with the layout. - for (OpOperand &user : result.getUses()) { - Operation *owner = user.getOwner(); - unsigned operandNumber = user.getOperandNumber(); - // Add temorary layout attribute at the user op. - std::string attrName = - operandLayoutNamePrefix + std::to_string(operandNumber); - owner->setAttr(attrName, layout); - } - } -} -static void updateBranchTerminatorOpInterface( - mlir::OpBuilder &builder, - mlir::RegionBranchTerminatorOpInterface terminator, - GetLayoutCallbackFnTy getLayoutOfValue) { - if (!mlir::isa(terminator->getParentOp())) - return; - - llvm::SmallVector successors; - llvm::SmallVector operands(terminator->getNumOperands(), - nullptr); - terminator.getSuccessorRegions(operands, successors); - - for (mlir::RegionSuccessor &successor : successors) { - if (!successor.isParent()) - continue; - - mlir::OperandRange operands = terminator.getSuccessorOperands(successor); - mlir::ValueRange inputs = successor.getSuccessorInputs(); - for (auto [operand, input] : llvm::zip(operands, inputs)) { - // print arg and inp - // llvm::errs() << "arg: " << operand << ", inp: " << input << "\n"; - Type inputType = input.getType(); - if (!isa(inputType)) - continue; - xegpu::LayoutAttr inputLayout = getLayoutOfValue(input); - xegpu::LayoutAttr operandLayout = getLayoutOfValue(operand); - - if (!operandLayout) { - LLVM_DEBUG(DBGS() << "Expecting layout for region successor operand : " - << operand << " but got none.\n"); - continue; - } - - if (inputLayout && inputLayout != operandLayout) { - LLVM_DEBUG( - DBGS() - << "Conflicting layouts for region successor operand and input: " - << inputLayout << " vs " << operandLayout << "\n"); - continue; - } - // Get tensor descriptor type with the layout. 
- auto tdescTy = dyn_cast(inputType); - auto newTdescTy = xegpu::TensorDescType::get( - tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(), - tdescTy.getEncoding(), operandLayout); - input.setType(newTdescTy); - } - } -} -static void updateBranchOpInterface(mlir::OpBuilder &builder, - mlir::RegionBranchOpInterface branch, - GetLayoutCallbackFnTy getLayoutOfValue) { - mlir::Operation *op = branch.getOperation(); - llvm::SmallVector successors; - llvm::SmallVector operands(op->getNumOperands(), nullptr); - branch.getEntrySuccessorRegions(operands, successors); - DenseMap resultToLayouts; - mlir::ValueRange results = op->getResults(); - - for (mlir::RegionSuccessor &successor : successors) { - if (successor.isParent()) - continue; - - mlir::OperandRange operands = branch.getEntrySuccessorOperands(successor); - mlir::ValueRange inputs = successor.getSuccessorInputs(); - - for (auto [operand, input, result] : llvm::zip(operands, inputs, results)) { - Type inputType = input.getType(); - if (!isa(inputType)) - continue; - xegpu::LayoutAttr blockArgLayout = getLayoutOfValue(input); - xegpu::LayoutAttr initArgLayout = getLayoutOfValue(operand); - - if (!blockArgLayout || !initArgLayout) { - LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << input - << " or init arg: " << operand << "\n"); - continue; - } - - // TOOD: We expect these two to match. Data flow analysis will ensure - // this. - assert(blockArgLayout == initArgLayout && - "Expexing block arg and init arg to have the same layout."); - // Get tensor descriptor type with the layout. - auto tdescTy = dyn_cast(inputType); - auto newTdescTy = xegpu::TensorDescType::get( - tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(), - tdescTy.getEncoding(), blockArgLayout); - input.setType(newTdescTy); - // Store the layout for the result. - if (resultToLayouts.count(result) != 0 && - resultToLayouts[result] != blockArgLayout) { - LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result - << " - " << resultToLayouts[result] << " vs " - << blockArgLayout << "\n"); - } else { - resultToLayouts[result] = blockArgLayout; - } - } - } - for (auto [i, r] : llvm::enumerate(op->getResults())) { - Type resultType = r.getType(); - if (!isa(resultType)) - continue; - xegpu::LayoutAttr layout = getLayoutOfValue(r); - if (!layout) - layout = resultToLayouts[r]; - if (!layout) { - LLVM_DEBUG(DBGS() << "No layout assigned for vector/tensor desc result:" - << r << "\n"); - continue; - } - if (auto tensorDescTy = dyn_cast(resultType)) { - auto newTdescTy = xegpu::TensorDescType::get( - tensorDescTy.getContext(), tensorDescTy.getShape(), - tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); - r.setType(newTdescTy); - continue; - } - // If the result is a vector type, add a temporary layout attribute to - // the op. - std::string resultLayoutName = - resultLayoutNamePrefix + std::to_string(r.getResultNumber()); - op->setAttr(resultLayoutName, layout); - // Update all users of the result with the layout. - for (OpOperand &user : r.getUses()) { - Operation *owner = user.getOwner(); - unsigned operandNumber = user.getOperandNumber(); - // Add temporary layout attribute at the user op. 
- std::string attrName = - operandLayoutNamePrefix + std::to_string(operandNumber); - owner->setAttr(attrName, layout); - } - } -} - -namespace { - //===----------------------------------------------------------------------===// // SIMT Distribution Patterns //===----------------------------------------------------------------------===// @@ -1845,46 +833,6 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( } void XeGPUSubgroupDistributePass::runOnOperation() { - auto &analyis = getAnalysis(); - // Print the analysis result and exit. (for testing purposes) - if (printOnly) { - auto &os = llvm::outs(); - analyis.printAnalysisResult(os); - return; - } - - auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr { - LayoutInfo layout = analyis.getLayoutInfo(val); - if (!layout.isAssigned()) { - return {}; - } - SmallVector laneLayout, laneData; - for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(), - layout.getDataAsArrayRef())) { - laneLayout.push_back(static_cast(layout)); - laneData.push_back(static_cast(data)); - } - return xegpu::LayoutAttr::get(val.getContext(), laneLayout, laneData); - }; - - mlir::OpBuilder builder(&getContext()); - Operation *op = getOperation(); - op->walk([&](mlir::Block *block) { - for (mlir::Operation &op : llvm::reverse(block->getOperations())) { - if (auto terminator = - mlir::dyn_cast(op)) { - updateBranchTerminatorOpInterface(builder, terminator, - getXeGPULayoutForValue); - continue; - } - - if (auto iface = mlir::dyn_cast(op)) { - updateBranchOpInterface(builder, iface, getXeGPULayoutForValue); - continue; - } - updateOp(builder, &op, getXeGPULayoutForValue); - } - }); // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0 // operation. From 92c23f189b06d0dd5df702774e5788fd53c1d67b Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 5 Jun 2025 16:40:23 +0000 Subject: [PATCH 14/44] fix test --- .../Dialect/XeGPU/subgroup-distribution.mlir | 252 +++++++++--------- 1 file changed, 125 insertions(+), 127 deletions(-) diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir index b5f6bda26d830..0f236d4e8b9dc 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -xegpu-subgroup-distribute -cse -split-input-file %s | FileCheck %s +// RUN: mlir-opt -xegpu-subgroup-distribute -canonicalize -cse -split-input-file %s | FileCheck %s // CHECK-LABEL: gpu.func @store_nd_1d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) { @@ -7,13 +7,13 @@ // CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> // CHECK: gpu.return gpu.module @test { -gpu.func @store_nd_1d(%arg0: memref<16xf32>){ - %c0 = arith.constant 0 : index - %1 = arith.constant dense<1.000000e+00> : vector<16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> - xegpu.store_nd %1, %0 : vector<16xf32>, !xegpu.tensor_desc<16xf32> - gpu.return -} + gpu.func @store_nd_1d(%arg0: memref<16xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf32> + %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + xegpu.store_nd %cst, %0 {layout_operand_0 = #xegpu.layout} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + gpu.return + } } // ----- @@ -23,13 +23,13 @@ gpu.func @store_nd_1d(%arg0: memref<16xf32>){ 
// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @test { -gpu.func @store_nd_2d(%arg0: memref<16x16xf16>){ - %c0 = arith.constant 0 : index - %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> - gpu.return -} + gpu.func @store_nd_2d(%arg0: memref<16x16xf16>) { + %c0 = arith.constant 0 : index + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %cst, %0 {layout_operand_0 = #xegpu.layout} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + gpu.return + } } @@ -42,14 +42,14 @@ gpu.func @store_nd_2d(%arg0: memref<16x16xf16>){ // CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> // CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> gpu.module @test { -gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){ - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32> - %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> - xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> - gpu.return -} + gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.layout> -> vector<16xf32> + %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + xegpu.store_nd %1, %2 {layout_operand_0 = #xegpu.layout} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + gpu.return + } } // ----- @@ -60,14 +60,14 @@ gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){ // CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @test { -gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){ - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> - gpu.return -} + gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> 
+ xegpu.store_nd %1, %2 {layout_operand_0 = #xegpu.layout} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + gpu.return + } } // ----- @@ -81,15 +81,15 @@ gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){ // CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16> // CHECK: xegpu.store_nd %[[T5]], %[[T4]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @test { -gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){ - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> - %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16> - %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> - gpu.return -} + gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> + %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<2x16x16xf16> + %2 = vector.extract %1[%c0] {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf16> from vector<2x16x16xf16> + %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %2, %3 {layout_operand_0 = #xegpu.layout} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + gpu.return + } } // ----- @@ -103,17 +103,17 @@ gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16 // CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @test { -gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg3: memref<8x16xf32>){ - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg3[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - gpu.return -} + gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> + %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> 
vector<16x16xf16> + %4 = xegpu.dpas %1, %3 {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %4, %5 {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + gpu.return + } } @@ -131,22 +131,21 @@ gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %ar // CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: xegpu.store_nd %[[T8]], %[[T7]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @test { -gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg3: memref<8x16xf32>){ - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = math.exp %4 : vector<8x16xf32> - %6 = xegpu.create_nd_tdesc %arg3[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - gpu.return -} + gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> + %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %4 = xegpu.dpas %1, %3 {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + %5 = math.exp %4 {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf32> + %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %5, %6 {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + gpu.return + } } // ----- -gpu.module @test { // CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index, // CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index, @@ -155,15 +154,15 @@ gpu.module @test { // CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, - %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { - %c0 = 
arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 [%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16> - xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> - gpu.return -} +gpu.module @test { + gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %1, %2 {layout_operand_0 = #xegpu.layout} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + gpu.return + } } // ----- @@ -191,31 +190,30 @@ gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, // CHECK-NEXT: %[[C_FINAL:.*]] = vector.shape_cast %[[T7]]#0 : vector<8x1xf32> to vector<8xf32> // CHECK-NEXT: xegpu.store_nd %[[C_FINAL]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @test { -gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ - %c0 = arith.constant 0 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c1024 = arith.constant 1024 : index - %0 = gpu.block_id x - %1 = gpu.block_id y - %2 = arith.muli %0, %c8 : index - %3 = arith.muli %1, %c16 : index - %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> - %5 = xegpu.load_nd %4 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - %7 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> - %8 = xegpu.create_nd_tdesc %arg1[%c0, %3] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> - %6:3 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %5, %arg5 = %7, %arg6 = %8) -> (vector<8x16xf32>, !xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16>) { - %9 = xegpu.load_nd %arg5 : !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16> - %10 = xegpu.load_nd %arg6 : !xegpu.tensor_desc<16x16xbf16> -> vector<16x16xbf16> - %12 = xegpu.update_nd_offset %arg5, [%c0, %c16] : !xegpu.tensor_desc<8x16xbf16> - %13 = xegpu.update_nd_offset %arg6, [%c16, %c0] : !xegpu.tensor_desc<16x16xbf16> - %11 = xegpu.dpas %9, %10, %arg4 : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> - scf.yield %11, %12, %13 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16> + gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %0 = arith.muli %block_id_x, %c8 : index + %1 = arith.muli %block_id_y, %c16 : index + %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : 
!xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> + %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> + %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> + %6:3 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3, %arg5 = %4, %arg6 = %5) -> (vector<8x16xf32>, !xegpu.tensor_desc<8x16xbf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xbf16, #xegpu.layout>) { + %8 = xegpu.load_nd %arg5 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> + %9 = xegpu.load_nd %arg6 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> + %10 = xegpu.update_nd_offset %arg5, [%c0, %c16] : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> + %11 = xegpu.update_nd_offset %arg6, [%c16, %c0] : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> + %12 = xegpu.dpas %8, %9, %arg4 {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield {layout_operand_0 = #xegpu.layout} %12, %10, %11 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xbf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> + } {layout_operand_3 = #xegpu.layout, layout_result_0 = #xegpu.layout} + xegpu.store_nd %6#0, %2 {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + gpu.return } - %12 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %6#0, %12 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - gpu.return -} } // ----- @@ -226,15 +224,15 @@ gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16> // CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32] : !xegpu.tensor_desc<16xf32> // CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> gpu.module @test { -gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>){ - %c0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %1 = arith.constant dense<1.000000e+00> : vector<16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32> - %2 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32> - xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> - gpu.return -} + gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>) { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf32> + %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + %1 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32, #xegpu.layout> + xegpu.store_nd %cst, %1 {layout_operand_0 = #xegpu.layout} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + gpu.return + } } // ----- @@ -245,15 +243,15 @@ gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>){ // CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32, %c32] : !xegpu.tensor_desc<16x16xf32> // CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<16xf32>, !xegpu.tensor_desc<16x16xf32> gpu.module @test { -gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ - %c0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %1 = arith.constant dense<1.000000e+00> : vector<16x16xf32> - %0 = 
xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32> - %2 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32> - xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32> - gpu.return -} + gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf32> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + xegpu.store_nd %cst, %1 {layout_operand_0 = #xegpu.layout} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + gpu.return + } } // ----- @@ -262,12 +260,12 @@ gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> gpu.module @test { -gpu.func @prefetch_2d(%arg0: memref<256x256xf16>){ - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> - xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16x16xf16> - gpu.return -} + gpu.func @prefetch_2d(%arg0: memref<256x256xf16>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + gpu.return + } } // ----- @@ -276,10 +274,10 @@ gpu.func @prefetch_2d(%arg0: memref<256x256xf16>){ // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> // CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> gpu.module @test { -gpu.func @prefetch_1d(%arg0: memref<256xf16>){ - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> - xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16xf16> - gpu.return -} + gpu.func @prefetch_1d(%arg0: memref<256xf16>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> + xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> + gpu.return + } } From 7b69082fa2fd3d54ac164ebeae43ed464ab30d6a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 5 Jun 2025 16:46:24 +0000 Subject: [PATCH 15/44] fix names --- .../{subgroup-map-propagation.mlir => layout-propagate.mlir} | 0 .../{subgroup-distribution.mlir => subgroup-distribute.mlir} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename mlir/test/Dialect/XeGPU/{subgroup-map-propagation.mlir => layout-propagate.mlir} (100%) rename mlir/test/Dialect/XeGPU/{subgroup-distribution.mlir => subgroup-distribute.mlir} (100%) diff --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/layout-propagate.mlir similarity index 100% rename from 
mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
rename to mlir/test/Dialect/XeGPU/layout-propagate.mlir
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
similarity index 100%
rename from mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
rename to mlir/test/Dialect/XeGPU/subgroup-distribute.mlir

From 56696165ff7886a802d1334f0826e50373d47b2b Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 5 Jun 2025 17:42:15 +0000
Subject: [PATCH 16/44] func op iface support

---
 .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 53 +++++++++++++++++--
 1 file changed, 49 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp
index f308d338b511a..d876110fe2692 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp
@@ -873,6 +873,46 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder,
   }
 }
 
+static void updateFunctionOpInterface(mlir::OpBuilder &builder,
+                                      mlir::FunctionOpInterface funcOp,
+                                      GetLayoutCallbackFnTy getLayoutOfValue) {
+  SmallVector<Type> newArgTypes;
+  // Update the function arguments.
+  for (BlockArgument arg : funcOp.getArguments()) {
+    Type argType = arg.getType();
+    newArgTypes.push_back(argType);
+    if (!isa<VectorType, xegpu::TensorDescType>(argType))
+      continue;
+    xegpu::LayoutAttr layout = getLayoutOfValue(arg);
+    if (!layout) {
+      LLVM_DEBUG(DBGS() << "Expecting layout for function argument: " << arg
+                        << " but got none.\n");
+      continue;
+    }
+    if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(argType)) {
+      auto newTdescTy = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
+      arg.setType(newTdescTy);
+      newArgTypes.back() = newTdescTy;
+      continue;
+    }
+    // If the argument is a vector type, update all the users of the argument
+    // with the layout.
+    for (OpOperand &user : arg.getUses()) {
+      Operation *owner = user.getOwner();
+      unsigned operandNumber = user.getOperandNumber();
+      std::string attrName =
+          operandLayoutNamePrefix + std::to_string(operandNumber);
+      owner->setAttr(attrName, layout);
+    }
+  }
+  // Update the function type with the new argument types.
+  // NOTE: We assume that function results are not expected to have layouts.
+  funcOp.setType(FunctionType::get(funcOp.getContext(), newArgTypes,
+                                   funcOp.getResultTypes()));
+}
+
 namespace {
 
 struct XeGPULayoutPropagatePass final
@@ -903,15 +943,20 @@ void XeGPULayoutPropagatePass::runOnOperation() {
   Operation *op = getOperation();
   op->walk([&](mlir::Block *block) {
     for (mlir::Operation &op : llvm::reverse(block->getOperations())) {
-      if (auto terminator =
+      if (auto branchTermOp =
              mlir::dyn_cast<mlir::RegionBranchTerminatorOpInterface>(op)) {
-        updateBranchTerminatorOpInterface(builder, terminator,
+        updateBranchTerminatorOpInterface(builder, branchTermOp,
                                           getXeGPULayoutForValue);
         continue;
       }
-      if (auto iface = mlir::dyn_cast<mlir::RegionBranchOpInterface>(op)) {
-        updateBranchOpInterface(builder, iface, getXeGPULayoutForValue);
+      if (auto regionBrOp = mlir::dyn_cast<mlir::RegionBranchOpInterface>(op)) {
+        updateBranchOpInterface(builder, regionBrOp, getXeGPULayoutForValue);
+        continue;
+      }
+
+      if (auto funcOp = mlir::dyn_cast<mlir::FunctionOpInterface>(op)) {
+        updateFunctionOpInterface(builder, funcOp, getXeGPULayoutForValue);
         continue;
       }
       updateOp(builder, &op, getXeGPULayoutForValue);

From 71902aa6c8eb28ee13c7b802951ae5a5c1195ef7 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Thu, 5 Jun 2025 20:00:53 +0000
Subject: [PATCH 17/44] fix test

---
 mlir/test/Dialect/XeGPU/layout-propagate.mlir | 511 +++++-------------
 1 file changed, 134 insertions(+), 377 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/layout-propagate.mlir b/mlir/test/Dialect/XeGPU/layout-propagate.mlir
index c7c82fc8dbb3c..f698b997e8cb7 100644
--- a/mlir/test/Dialect/XeGPU/layout-propagate.mlir
+++ b/mlir/test/Dialect/XeGPU/layout-propagate.mlir
@@ -1,29 +1,16 @@
-// RUN: mlir-opt -xegpu-subgroup-distribute='print-analysis-only=true' -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -xegpu-layout-propagate -split-input-file %s | FileCheck %s
 
-// CHECK: function: test_dpas_f16:
-// CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: layout : Not assigned.
-// CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: layout : Not assigned.
-// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: layout : Not assigned.
-// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op : %{{.*}} = arith.constant dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { +// CHECK-LABEL: func.func @dpas_f16( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_operand_2 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> +// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -36,22 +23,11 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg return } - // ----- -// CHECK: function: test_dpas_i8: -// CHECK-NEXT: argument: of type 'vector<8x32xi8>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 2] -// CHECK-NEXT: argument: of type 'vector<32x16xi8>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [4, 1] -// 
CHECK-NEXT: argument: of type 'memref<8x16xi32>' at index: 2
-// CHECK-NEXT: layout : Not assigned.
-// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op : %[[T0:.*]] = xegpu.dpas %{{.*}} : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
+// CHECK-LABEL: func.func @dpas_i8(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) {
+// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
+func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
   %1 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
@@ -60,30 +36,10 @@
 }
 
 // -----
-// CHECK: function: test_load_with_transpose_effect:
-// CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: layout : Not assigned.
-// CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: layout : Not assigned.
-// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: layout : Not assigned.
-// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
-// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
-// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-// CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+// CHECK-LABEL: func.func @load_with_transpose_effect(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array<i64: 1, 0>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>> -> vector<16x16xf16>
+func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -97,32 +53,10 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre
 }
 
 // -----
-// CHECK: function: test_vector_transpose:
-// CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: layout : Not assigned.
-// CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: layout : Not assigned.
-// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: layout : Not assigned.
-// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: layout for result #0: Not assigned.
-// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T4:.*]] = vector.transpose %[[T3]], [1, 0] : vector<16x16xf16> to vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T4]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { +// CHECK-LABEL: func.func @vector_transpose( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf16> +func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -137,22 +71,11 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1 } // ----- -// CHECK: function: test_extf_truncf: -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T2:.*]] = arith.extf %[[T1]] : vector<16x16xf16> to vector<16x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T3:.*]] = arith.truncf %[[T2]] : vector<16x16xf32> to vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> 
vector<8x16xf32> -// CHECK-NEXT: layout for result #0: Not assigned. -func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> { +// CHECK-LABEL: func.func @extf_truncf( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) -> vector<8x16xf32> { +// CHECK: %[[T2:.*]] = arith.extf %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf32> +// CHECK-NEXT: %{{.*}} = arith.truncf %[[T2]] {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf32> to vector<16x16xf16> +func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %2 = arith.extf %1 : vector<16x16xf16> to vector<16x16xf32> @@ -162,32 +85,13 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t } // ----- -// CHECK: function: test_load_gather_with_transpose_effect: -// CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<256xf16>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr> -// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load %[[T2]], %[[CST0]] <{transpose}> : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { +// CHECK-LABEL: func.func @load_gather_with_transpose_effect( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: 
memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> +// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> +// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] {layout_operand_1 = #xegpu.layout} : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{transpose}> {layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> -> vector<16x16xf16> +func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> @@ -202,20 +106,13 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1 } // ----- -// CHECK: function: test_load_gather_1d: -// CHECK: argument: of type 'memref<256xf32>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T1]] = xegpu.load %[[T0]], %[[CST0]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { +// CHECK-LABEL: func.func @load_gather_1d( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> +// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> +// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] {layout_operand_1 = #xegpu.layout} : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] {layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> -> vector<16xf32> +func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> %cst_0 = arith.constant dense 
: vector<16xi1> %0 = xegpu.create_tdesc %arg0, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> @@ -225,18 +122,11 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc } // ----- -// CHECK: function: test_store_scatter_with_transpose_effect: -// CHECK-NEXT: argument: of type 'memref<128xf32>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST1:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST1]] : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> -// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 1] -func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { +// CHECK-LABEL: func.func @store_scatter_with_transpose_effect( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) { +// CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} {layout_operand_1 = #xegpu.layout} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> +// CHECK-NEXT: xegpu.store %{{.*}}, %[[T0]], %{{.*}} <{transpose}> {layout_operand_0 = #xegpu.layout, layout_operand_2 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> +func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { %cst = arith.constant dense<1.000000e+00> : vector<8x16xf32> %cst_0 = arith.constant dense : vector<16xi1> %cst_1 = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> @@ -246,18 +136,10 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { } // ----- -// CHECK: function: test_store_scatter_1d: -// CHECK-NEXT: argument: of type 'vector<16xf32>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1] -// CHECK-NEXT: argument: of type 'memref<256xf32>' at index: 1 -// CHECK-NEXT: layout : Not assigned. 
-// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST1:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { +// CHECK-LABEL: func.func @store_scatter_1d( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { +// CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} {layout_operand_0 = #xegpu.layout, layout_operand_2 = #xegpu.layout} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> +func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> %cst_0 = arith.constant dense : vector<16xi1> %0 = xegpu.create_tdesc %arg1, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> @@ -266,30 +148,10 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) } // ----- -// CHECK: function: test_vector_bitcast_i16_to_i8: -// CHECK-NEXT: argument: of type 'memref<8x16xi16>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<32x16xi8>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<8x16xi32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. 
-// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<32x16xi8> -> vector<32x16xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1] -// CHECK-NEXT: op : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x16xi16> to vector<8x32xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T5:.*]] = xegpu.dpas %[[T4]], %[[T3]] : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) { +// CHECK-LABEL: func.func @vector_bitcast_i16_to_i8( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xi16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { +// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xi16> to vector<8x32xi8> +func.func @vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8> @@ -303,32 +165,11 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref< } // ----- -// CHECK: function: test_vector_bitcast_i8_to_f16: -// CHECK-NEXT: argument: of type 'memref<8x32xi8>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<16x32xi8>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. 
-// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x32xi8> -> vector<8x32xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x32xi8> -> vector<16x32xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1] -// CHECK-NEXT: op : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x32xi8> to vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T5:.*]] = vector.bitcast %[[T3]] : vector<16x32xi8> to vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T7:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) { +// CHECK-LABEL: func.func @vector_bitcast_i8_to_f16( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x32xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x32xi8> to vector<8x16xf16> +// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x32xi8> to vector<16x16xf16> +func.func @vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8> %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8> @@ -343,24 +184,12 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1 } // ----- -// CHECK: function: test_binary_op_one_use: -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: 
lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T3:.*]] = arith.addf %[[T1]], %[[T2]] : vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { +// CHECK-LABEL: func.func @binary_op_one_use( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf16> +func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %2 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -371,26 +200,13 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x } // ----- -// CHECK: function: test_binary_op_multiple_uses: -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 3 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T2:.*]] = arith.addf %[[T1]], %[[CST]] : vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.dpas %[[T0]], %[[T2]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { +// CHECK-LABEL: func.func 
@binary_op_multiple_uses( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { +// CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf16> +// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> +// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] {layout_operand_0 = #xegpu.layout} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %cst = arith.constant dense<1.000000e+00> : vector<16x16xf16> @@ -402,42 +218,22 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar } // ----- -// CHECK: function: test_for_op: -// CHECK-NEXT: argument: of type 'memref<8x128xf16>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<128x16xf16>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 128 : index -// CHECK-NEXT: layout for result #0: Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 16 : index -// CHECK-NEXT: layout for result #0: Not assigned. 
-// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T5:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T7:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T8:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : scf.for -// CHECK-NEXT: layout for result #0: Not assigned. -// CHECK-NEXT: layout for result #1: Not assigned. -// CHECK-NEXT: layout for result #2: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) { +// CHECK-LABEL: func.func @for_op( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> +// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) -> (!xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32>) { +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> +// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> +// CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset 
%[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK-NEXT: scf.yield {layout_operand_2 = #xegpu.layout} %[[T7]], %[[T8]], %[[T6]] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32> +// CHECK-NEXT: } {layout_operand_5 = #xegpu.layout, layout_result_2 = #xegpu.layout} +// CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %c16 = arith.constant 16 : index @@ -458,26 +254,16 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg } // ----- -// CHECK: function: test_if_single_use: -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: argument: of type 'i1' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 3 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : scf.if -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) { +// CHECK-LABEL: func.func @if_single_use( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { +// CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) { +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: scf.yield {layout_operand_0 = #xegpu.layout} %[[T3]] : vector<16x16xf16> +// CHECK-NEXT: } else { +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: scf.yield {layout_operand_0 = #xegpu.layout} %[[T4]] : vector<16x16xf16> +// CHECK-NEXT: } {layout_result_0 = #xegpu.layout} +func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, 
%arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = scf.if %arg2 -> (vector<16x16xf16>) { %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -492,28 +278,16 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu } // ----- -// CHECK: function: test_if_multiple_uses: -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type 'i1' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 3 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 4 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : scf.if -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) { +// CHECK-LABEL: func.func @if_multiple_uses( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { +// CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) { +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: scf.yield {layout_operand_0 = #xegpu.layout} %[[T3]] : vector<16x16xf16> +// CHECK-NEXT: } else { +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: scf.yield {layout_operand_0 = #xegpu.layout} %[[T4]] : vector<16x16xf16> +// CHECK-NEXT: } {layout_result_0 = #xegpu.layout} +func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = scf.if %arg2 -> (vector<16x16xf16>) { %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -529,16 +303,10 @@ func.func 
@test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe } // ----- -// CHECK: function: test_vector_outer_reduction: -// CHECK-NEXT: argument: of type 'vector<16x16xf32>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction , %{{.*}}, %[[CST]] [0] : vector<16x16xf32> to vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { +// CHECK-LABEL: func.func @vector_outer_reduction( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { +// CHECK: %{{.*}} = vector.multi_reduction , %[[ARG0]], %{{.*}} {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} [0] : vector<16x16xf32> to vector<16xf32> +func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<0.000000e+00> : vector<16xf32> %0 = vector.multi_reduction , %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32> xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> @@ -546,16 +314,10 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t } // ----- -// CHECK: function: test_vector_inner_reduction: -// CHECK-NEXT: argument: of type 'vector<16x16xf32>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction , %{{.*}}, %[[CST]] [1] : vector<16x16xf32> to vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @test_vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { +// CHECK-LABEL: func.func @vector_inner_reduction( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { +// CHECK: %{{.*}} = vector.multi_reduction , %[[ARG0]], %{{.*}} {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} [1] : vector<16x16xf32> to vector<16xf32> +func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<0.000000e+00> : vector<16xf32> %0 = vector.multi_reduction , %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32> xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> @@ -563,13 +325,10 @@ func.func @test_vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t } // ----- -// CHECK: function: update_nd_offset_1d: -// CHECK: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256xf32> -> 
!xegpu.tensor_desc<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] +// CHECK-LABEL: func.func @update_nd_offset_1d( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> +// CHECK-NEXT: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16xf32, #xegpu.layout> func.func @update_nd_offset_1d(%arg0: memref<256xf32>){ %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index @@ -581,13 +340,10 @@ func.func @update_nd_offset_1d(%arg0: memref<256xf32>){ } // ----- -// CHECK: function: update_nd_offset_2d: -// CHECK: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] +// CHECK-LABEL: func.func @update_nd_offset_2d( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> +// CHECK-NEXT: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index @@ -599,10 +355,10 @@ func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ } // ----- -// CHECK: function: prefetch_2d: -// CHECK: layout for result #0: Not assigned. 
-// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] +// CHECK-LABEL: func.func @prefetch_2d( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> func.func @prefetch_2d(%arg0: memref<256x256xf16>){ %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> @@ -611,9 +367,10 @@ func.func @prefetch_2d(%arg0: memref<256x256xf16>){ } // ----- -// CHECK: function: prefetch_1d: -// CHECK: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] +// CHECK-LABEL: func.func @prefetch_1d( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> +// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> func.func @prefetch_1d(%arg0: memref<256xf16>){ %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> From 341daff6dd9f95fcd6a73240f6edb108a8e50b77 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 5 Jun 2025 20:15:04 +0000 Subject: [PATCH 18/44] fix test --- .../Dialect/XeGPU/subgroup-distribute.mlir | 84 +++++++++---------- 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 0f236d4e8b9dc..3bfabac55faf3 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -168,52 +168,48 @@ gpu.module @test { // ----- // CHECK-LABEL: gpu.func @gemm_loop // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { -// CHECK: %[[BLOCK_ID_X:.*]] = gpu.block_id x -// CHECK: %[[BLOCK_ID_Y:.*]] = gpu.block_id y -// CHECK: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index -// CHECK: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index +// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x +// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y +// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index +// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%[[X_COORD]], %[[Y_COORD]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> -// CHECK-DAG: %[[C_INIT:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> -// CHECK-DAG: %[[B_TILE:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}, %[[Y_COORD]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> -// CHECK-DAG: %[[A_TILE:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[X_COORD]], %{{.*}}] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> -// CHECK: %[[T7:.*]]:3 = scf.for 
{{.*}} iter_args(%[[C_VAL:.*]] = %[[C_INIT]], %[[A_ARG:.*]] = %[[A_TILE]], %[[B_ARG:.*]] = %[[B_TILE]]) -> (vector<8x1xf32>, !xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16>) { -// CHECK-DAG: %[[B_NEXT:.*]] = xegpu.update_nd_offset %[[B_ARG]], [{{.*}}] : !xegpu.tensor_desc<16x16xbf16> -// CHECK-DAG: %[[A_NEXT:.*]] = xegpu.update_nd_offset %[[A_ARG]], [{{.*}}] : !xegpu.tensor_desc<8x16xbf16> -// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[B_ARG]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> -// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[A_ARG]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> -// CHECK-DAG: %[[C:.*]] = vector.shape_cast %[[C_VAL]] : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: %[[T8:.*]] = xegpu.dpas %[[A]], %[[B]], %[[C]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> -// CHECK-NEXT: %[[C_OUT:.*]] = vector.shape_cast %[[T8]] : vector<8xf32> to vector<8x1xf32> -// CHECK-NEXT: scf.yield %[[C_OUT]], %[[A_NEXT]], %[[B_NEXT]] : vector<8x1xf32>, !xegpu.tensor_desc<8x16xbf16>, !xegpu.tensor_desc<16x16xbf16> -// CHECK-NEXT:} -// CHECK-NEXT: %[[C_FINAL:.*]] = vector.shape_cast %[[T7]]#0 : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: xegpu.store_nd %[[C_FINAL]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> +// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> +// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) { +// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[K]], %[[Y_COORD]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> +// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> +// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[X_COORD]], %[[K]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> +// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> +// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> +// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> +// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @test { - gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>) { - %c0 = arith.constant 0 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c1024 = arith.constant 1024 : index - %block_id_x = gpu.block_id x - %block_id_y = gpu.block_id y - %0 = arith.muli %block_id_x, %c8 : index - %1 = arith.muli %block_id_y, %c16 : index - %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> - %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> - %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xbf16> -> 
!xegpu.tensor_desc<16x16xbf16, #xegpu.layout> - %6:3 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3, %arg5 = %4, %arg6 = %5) -> (vector<8x16xf32>, !xegpu.tensor_desc<8x16xbf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xbf16, #xegpu.layout>) { - %8 = xegpu.load_nd %arg5 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> - %9 = xegpu.load_nd %arg6 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> - %10 = xegpu.update_nd_offset %arg5, [%c0, %c16] : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> - %11 = xegpu.update_nd_offset %arg6, [%c16, %c0] : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> - %12 = xegpu.dpas %8, %9, %arg4 {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> - scf.yield {layout_operand_0 = #xegpu.layout} %12, %10, %11 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xbf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> - } {layout_operand_3 = #xegpu.layout, layout_result_0 = #xegpu.layout} - xegpu.store_nd %6#0, %2 {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - gpu.return - } +gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %0 = arith.muli %block_id_x, %c8 : index + %1 = arith.muli %block_id_y, %c16 : index + %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> + %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { + %5 = xegpu.create_nd_tdesc %arg0[%0, %arg3] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> + %6 = xegpu.create_nd_tdesc %arg1[%arg3, %1] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> + %7 = xegpu.load_nd %5 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> + %8 = xegpu.load_nd %6 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> + %9 = xegpu.dpas %7, %8, %arg4 {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield {layout_operand_0 = #xegpu.layout} %9 : vector<8x16xf32> + } {layout_operand_3 = #xegpu.layout, layout_result_0 = #xegpu.layout} + xegpu.store_nd %4, %2 {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + gpu.return +} } // ----- From fdacb63e51af6de3a0deedddef30a10870d5d66b Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 5 Jun 2025 20:18:17 +0000 Subject: [PATCH 19/44] revert merge --- .../Vector/Transforms/VectorDistribute.cpp | 40 +++++-------------- .../Vector/vector-warp-distribute.mlir | 36 ----------------- 2 files changed, 10 insertions(+), 66 deletions(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index 
bd833ddb773f7..045c192787f10 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -1554,37 +1554,22 @@ struct WarpOpScfForOp : public WarpDistributionPattern { llvm::SmallSetVector escapingValues; SmallVector inputTypes; SmallVector distTypes; - auto collectEscapingValues = [&](Value value) { - if (!escapingValues.insert(value)) - return; - Type distType = value.getType(); - if (auto vecType = dyn_cast(distType)) { - AffineMap map = distributionMapFn(value); - distType = getDistributedType(vecType, map, warpOp.getWarpSize()); - } - inputTypes.push_back(value.getType()); - distTypes.push_back(distType); - }; - mlir::visitUsedValuesDefinedAbove( forOp.getBodyRegion(), [&](OpOperand *operand) { Operation *parent = operand->get().getParentRegion()->getParentOp(); if (warpOp->isAncestor(parent)) { - collectEscapingValues(operand->get()); + if (!escapingValues.insert(operand->get())) + return; + Type distType = operand->get().getType(); + if (auto vecType = dyn_cast(distType)) { + AffineMap map = distributionMapFn(operand->get()); + distType = getDistributedType(vecType, map, warpOp.getWarpSize()); + } + inputTypes.push_back(operand->get().getType()); + distTypes.push_back(distType); } }); - // Any forOp result that is not already yielded by the warpOp - // region is also considered escaping and must be returned by the - // original warpOp. - for (OpResult forResult : forOp.getResults()) { - // Check if this forResult is already yielded by the yield op. - if (llvm::is_contained(yield->getOperands(), forResult)) { - continue; - } - collectEscapingValues(forResult); - } - if (llvm::is_contained(distTypes, Type{})) return failure(); @@ -1624,12 +1609,7 @@ struct WarpOpScfForOp : public WarpDistributionPattern { forOp.getResultTypes().end()); llvm::SmallDenseMap argIndexMapping; for (auto [i, retIdx] : llvm::enumerate(newRetIndices)) { - auto newWarpResult = newWarpOp.getResult(retIdx); - // Unused forOp results yielded by the warpOp region are already included - // in the new ForOp. 
- if (llvm::is_contained(newOperands, newWarpResult)) - continue; - warpInput.push_back(newWarpResult); + warpInput.push_back(newWarpOp.getResult(retIdx)); argIndexMapping[escapingValues[i]] = warpInputType.size(); warpInputType.push_back(inputTypes[i]); } diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir index 6c7ac7a5196a7..38771f2593449 100644 --- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -584,42 +584,6 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref, %arg2 return } -// ----- -// CHECK-PROP-LABEL: func.func @warp_scf_for_unused_yield( -// CHECK-PROP: %[[W0:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[INI0:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: gpu.yield %[[INI0]], %[[INI1]] : vector<128xf32>, vector<128xf32> -// CHECK-PROP: } -// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} iter_args(%{{.*}} = %[[W0]]#0, %{{.*}} = %[[W0]]#1) -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[ACC0:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>, index) -> vector<128xf32> -// CHECK-PROP: %[[ACC1:.*]] = "some_def"(%{{.*}}) : (index, vector<128xf32>, vector<128xf32>) -> vector<128xf32> -// CHECK-PROP: gpu.yield %[[ACC1]], %[[ACC0]] : vector<128xf32>, vector<128xf32> -// CHECK-PROP: } -// CHECK-PROP: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<4xf32>, vector<4xf32> -// CHECK-PROP: } -// CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> () -func.func @warp_scf_for_unused_yield(%arg0: index) { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { - %ini = "some_def"() : () -> (vector<128xf32>) - %ini1 = "some_def"() : () -> (vector<128xf32>) - %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini, %arg5 = %ini1) -> (vector<128xf32>, vector<128xf32>) { - %add = arith.addi %arg3, %c1 : index - %1 = "some_def"(%arg5, %add) : (vector<128xf32>, index) -> (vector<128xf32>) - %acc = "some_def"(%add, %arg4, %1) : (index, vector<128xf32>, vector<128xf32>) -> (vector<128xf32>) - scf.yield %acc, %1 : vector<128xf32>, vector<128xf32> - } - gpu.yield %3#0 : vector<128xf32> - } - "some_use"(%0) : (vector<4xf32>) -> () - return -} - - // ----- // CHECK-PROP-LABEL: func @vector_reduction( From 57acc9e1f06bedea779ddb3e0097948f353f3ede Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 5 Jun 2025 20:20:11 +0000 Subject: [PATCH 20/44] add comment --- mlir/test/Dialect/XeGPU/subgroup-distribute.mlir | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 3bfabac55faf3..7362c175a70a4 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -166,6 +166,7 @@ gpu.module @test { } // ----- +// TODO: gemm does not use update_nd_offset because of an issue in vector distribution. PR141853 tracks this issue. 
// CHECK-LABEL: gpu.func @gemm_loop // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { // CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x From a99ee751d4112c152017805449ce2c623d906adb Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 5 Jun 2025 21:14:25 +0000 Subject: [PATCH 21/44] refactor --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 14 ++++++++ .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 35 ++++++++----------- .../Transforms/XeGPUSubgroupDistribute.cpp | 14 ++------ 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index f9327d63869c0..23f44dcb8725d 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -24,6 +24,20 @@ class LayoutAttr; class TensorDescType; } // namespace xegpu +namespace xegpu { +/// HW dependent constants. +/// TODO: These constants should be queried from the target information. +namespace targetinfo { +constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup. +/// If DPAS A or B operands have low precision element types they must be packed +/// according to the following sizes. +constexpr unsigned packedSizeInBitsForDefault = + 16; // Minimum packing size per register for DPAS A. +constexpr unsigned packedSizeInBitsForDpasB = + 32; // Minimum packing size per register for DPAS B. +} // namespace targetinfo +} // namespace xegpu + namespace xegpu { /// If tensor descriptor has a layout attribute it is used in SIMT mode. diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index ce2b1454fb6a0..fb69498dacb54 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -46,16 +46,6 @@ namespace xegpu { using namespace mlir; using namespace mlir::dataflow; -/// HW dependent constants. -/// TODO: These constants should be queried from the target information. -constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup. -/// If DPAS A or B operands have low precision element types they must be packed -/// according to the following sizes. -constexpr unsigned packedSizeInBitsForDefault = - 16; // Minimum packing size per register for DPAS A. -constexpr unsigned packedSizeInBitsForDpasB = - 32; // Minimum packing size per register for DPAS B. - namespace { //===----------------------------------------------------------------------===// @@ -198,8 +188,10 @@ struct LayoutInfoLattice : public Lattice { static LayoutInfo getDefaultLayoutInfo(unsigned rank) { assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); if (rank == 1) - return LayoutInfo(LaneLayout({subgroupSize}), LaneData({1})); - return LayoutInfo(LaneLayout({1, subgroupSize}), LaneData({1, 1})); + return LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize}), + LaneData({1})); + return LayoutInfo(LaneLayout({1, xegpu::targetinfo::subgroupSize}), + LaneData({1, 1})); } /// Helper to get the default layout for a vector type. @@ -216,9 +208,9 @@ static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) { // Packing factor is determined by the element type bitwidth. 
int packingFactor = 1; unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); - if (bitwidth < packedSizeInBitsForDefault) - packingFactor = packedSizeInBitsForDefault / bitwidth; - return LayoutInfo(LaneLayout({1, subgroupSize}), + if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) + packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth; + return LayoutInfo(LaneLayout({1, xegpu::targetinfo::subgroupSize}), LaneData({1, packingFactor})); } @@ -233,13 +225,14 @@ static LayoutInfo getLayoutInfoForDPASOperand(VectorType vectorTy, Type elementTy = vectorTy.getElementType(); assert(elementTy.isIntOrFloat() && "Expected int or float type in DPAS operands"); - LaneLayout layout({1, subgroupSize}); + LaneLayout layout({1, xegpu::targetinfo::subgroupSize}); // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and // must have the VNNI format. - if (operandNum == 1 && - elementTy.getIntOrFloatBitWidth() < packedSizeInBitsForDpasB) { - LaneData data( - {packedSizeInBitsForDpasB / elementTy.getIntOrFloatBitWidth(), 1}); + if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < + xegpu::targetinfo::packedSizeInBitsForDpasB) { + LaneData data({xegpu::targetinfo::packedSizeInBitsForDpasB / + elementTy.getIntOrFloatBitWidth(), + 1}); return LayoutInfo(layout, data); } // Otherwise, return the default layout for the vector type. @@ -577,7 +570,7 @@ void LayoutInfoPropagation::visitStoreScatterOp( ArrayRef tdescShape = storeScatter.getTensorDescType().getShape(); if (tdescShape.size() > 1) assert( - tdescShape[0] == subgroupSize && + tdescShape[0] == xegpu::targetinfo::subgroupSize && "Expected the first dimension of 2D tensor descriptor to be equal to " "subgroup size."); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 9ddf3abe667e2..73da16cb2e3fb 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -58,15 +58,6 @@ namespace xegpu { using namespace mlir; -/// HW dependent constants. -/// TODO: These constants should be queried from the target information. -constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup. -/// If DPAS A or B operands have low precision element types they must be packed -/// according to the following sizes. -constexpr unsigned packedSizeInBitsForDefault = - 16; // Minimum packing size per register for DPAS A. -constexpr unsigned packedSizeInBitsForDpasB = - 32; // Minimum packing size per register for DPAS B. static const char *const resolveSIMTTypeMismatch = "resolve_simt_type_mismatch"; // Attribute name for identifying // UnrelizedConversionCastOp added to resolve @@ -228,8 +219,9 @@ struct MoveFuncBodyToWarpExecuteOnLane0 /** upperBound = **/ mlir::IntegerAttr()); ArrayRef gpuFuncResultType = gpuFuncOp.getFunctionType().getResults(); auto warpOp = rewriter.create( - laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize, - newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes()); + laneId.getLoc(), gpuFuncResultType, laneId, + xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(), + newGpuFunc.getArgumentTypes()); Block &warpBodyBlock = warpOp.getBodyRegion().front(); // Replace the ReturnOp of the original gpu function with a YieldOp. 
auto origRetunOp = From 739aad7a7743c96b7935622806de50e09ffa85bd Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 5 Jun 2025 21:26:48 +0000 Subject: [PATCH 22/44] refactor --- mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td | 4 ---- .../lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 5 ----- 2 files changed, 9 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index ee25eee688095..29f936e81974e 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -27,10 +27,6 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> { }]; let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"]; - let options = [Option< - "printOnly", "print-analysis-only", "bool", - /*default=*/"false", - "Print the result of the subgroup map propagation analysis and exit.">]; } def XeGPULayoutPropagate : Pass<"xegpu-layout-propagate"> { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 73da16cb2e3fb..221c309e18a4b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -800,11 +800,6 @@ namespace { struct XeGPUSubgroupDistributePass final : public xegpu::impl::XeGPUSubgroupDistributeBase< XeGPUSubgroupDistributePass> { - XeGPUSubgroupDistributePass() = default; - XeGPUSubgroupDistributePass(const XeGPUSubgroupDistributePass &other) = - default; - XeGPUSubgroupDistributePass(xegpu::XeGPUSubgroupDistributeOptions options) - : XeGPUSubgroupDistributeBase(options) {} void runOnOperation() override; }; } // namespace From 76b7333a088d8a58c5f1aa2b7d2b3740962332cc Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 5 Jun 2025 22:32:06 +0000 Subject: [PATCH 23/44] refactor --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 100 ++++++++++-------- .../Transforms/XeGPUSubgroupDistribute.cpp | 1 - 2 files changed, 56 insertions(+), 45 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index fb69498dacb54..5ee034570ad0c 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -23,6 +23,7 @@ #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" #include "mlir/IR/Visitors.h" +#include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/ArrayRef.h" @@ -683,6 +684,22 @@ RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) { } using GetLayoutCallbackFnTy = function_ref; +/// Helper to update the users of a value with a given layout. +static void updateUsers(Value v, xegpu::LayoutAttr layout) { + // Update all users of the value with the layout. + for (OpOperand &user : v.getUses()) { + Operation *owner = user.getOwner(); + // Add temporary layout attribute at the user op. + std::string attrName = xegpu::getLayoutName(user); + owner->setAttr(attrName, layout); + } +} + +/// Update an operation with the layout of its results. If the result type is a +/// vector type, a temporary layout attribute is added to the operation. If the +/// result type is a tensor descriptor type, the type is updated with the layout +/// attribute. 
The users of the result are also updated with the layout +/// attribute. static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, GetLayoutCallbackFnTy getLayoutOfValue) { @@ -712,14 +729,12 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, std::string resultLayoutName = xegpu::getLayoutName(result); op->setAttr(resultLayoutName, layout); // Update all users of the result with the layout. - for (OpOperand &user : result.getUses()) { - Operation *owner = user.getOwner(); - // Add temorary layout attribute at the user op. - std::string attrName = xegpu::getLayoutName(user); - owner->setAttr(attrName, layout); - } + updateUsers(result, layout); } } + +/// Update the types of successor regions of a branch terminator op (scf.yield) +/// with assigned layouts. static void updateBranchTerminatorOpInterface( mlir::OpBuilder &builder, mlir::RegionBranchTerminatorOpInterface terminator, @@ -769,6 +784,10 @@ static void updateBranchTerminatorOpInterface( } } } + +/// Some operations contain multiple regions (like scf.for) each of which have +/// block arguments. This function updates the block arguments types of such +/// regions with the assigned layouts. static void updateBranchOpInterface(mlir::OpBuilder &builder, mlir::RegionBranchOpInterface branch, GetLayoutCallbackFnTy getLayoutOfValue) { @@ -790,33 +809,32 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder, Type inputType = input.getType(); if (!isa(inputType)) continue; - xegpu::LayoutAttr blockArgLayout = getLayoutOfValue(input); - xegpu::LayoutAttr initArgLayout = getLayoutOfValue(operand); + xegpu::LayoutAttr inputLayout = getLayoutOfValue(input); + xegpu::LayoutAttr operandLayout = getLayoutOfValue(operand); - if (!blockArgLayout || !initArgLayout) { + if (!inputLayout || !operandLayout) { LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << input << " or init arg: " << operand << "\n"); continue; } - // TOOD: We expect these two to match. Data flow analysis will ensure - // this. - assert(blockArgLayout == initArgLayout && + // TODO: We expect these two to match. + assert(inputLayout == operandLayout && "Expexing block arg and init arg to have the same layout."); // Get tensor descriptor type with the layout. auto tdescTy = dyn_cast(inputType); auto newTdescTy = xegpu::TensorDescType::get( tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(), - tdescTy.getEncoding(), blockArgLayout); + tdescTy.getEncoding(), inputLayout); input.setType(newTdescTy); // Store the layout for the result. if (resultToLayouts.count(result) != 0 && - resultToLayouts[result] != blockArgLayout) { + resultToLayouts[result] != inputLayout) { LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result << " - " << resultToLayouts[result] << " vs " - << blockArgLayout << "\n"); + << inputLayout << "\n"); } else { - resultToLayouts[result] = blockArgLayout; + resultToLayouts[result] = inputLayout; } } } @@ -844,15 +862,11 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder, std::string resultLayoutName = xegpu::getLayoutName(r); op->setAttr(resultLayoutName, layout); // Update all users of the result with the layout. - for (OpOperand &user : r.getUses()) { - Operation *owner = user.getOwner(); - // Add temporary layout attribute at the user op. - std::string attrName = xegpu::getLayoutName(user); - owner->setAttr(attrName, layout); - } + updateUsers(r, layout); } } +/// Update the function arguments and results with the layouts. 
static void updateFunctionOpInterface(mlir::OpBuilder &builder, mlir::FunctionOpInterface funcOp, GetLayoutCallbackFnTy getLayoutOfValue) { @@ -879,11 +893,7 @@ static void updateFunctionOpInterface(mlir::OpBuilder &builder, } // If the argument is a vector type, update all the users of the argument // with the layout. - for (OpOperand &user : arg.getUses()) { - Operation *owner = user.getOwner(); - std::string attrName = xegpu::getLayoutName(user); - owner->setAttr(attrName, layout); - } + updateUsers(arg, layout); } // Update the function type with the new argument types. // NOTE: We assume that function results are not expected to have layouts. @@ -902,7 +912,7 @@ struct XeGPULayoutPropagatePass final void XeGPULayoutPropagatePass::runOnOperation() { auto &analyis = getAnalysis(); - + // Helper to convert LayoutInfo to xegpu::LayoutAttr. auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr { LayoutInfo layout = analyis.getLayoutInfo(val); if (!layout.isAssigned()) { @@ -921,23 +931,25 @@ void XeGPULayoutPropagatePass::runOnOperation() { Operation *op = getOperation(); op->walk([&](mlir::Block *block) { for (mlir::Operation &op : llvm::reverse(block->getOperations())) { - if (auto branchTermOp = - mlir::dyn_cast(op)) { - updateBranchTerminatorOpInterface(builder, branchTermOp, + TypeSwitch(&op) + .Case( + [&](mlir::RegionBranchTerminatorOpInterface branchTermOp) { + updateBranchTerminatorOpInterface(builder, branchTermOp, + getXeGPULayoutForValue); + }) + .Case( + [&](mlir::RegionBranchOpInterface regionBrOp) { + updateBranchOpInterface(builder, regionBrOp, + getXeGPULayoutForValue); + }) + .Case( + [&](mlir::FunctionOpInterface funcOp) { + updateFunctionOpInterface(builder, funcOp, getXeGPULayoutForValue); - continue; - } - - if (auto regionBrOp = mlir::dyn_cast(op)) { - updateBranchOpInterface(builder, regionBrOp, getXeGPULayoutForValue); - continue; - } - - if (auto funcOp = mlir::dyn_cast(op)) { - updateFunctionOpInterface(builder, funcOp, getXeGPULayoutForValue); - continue; - } - updateOp(builder, &op, getXeGPULayoutForValue); + }) + .Default([&](Operation *op) { + updateOp(builder, op, getXeGPULayoutForValue); + }); } }); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 221c309e18a4b..eb8192417f843 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -812,7 +812,6 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( } void XeGPUSubgroupDistributePass::runOnOperation() { - // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0 // operation. 
{ From cbcfd61b7c9c0e2d165ef319f57a978350ca6ddf Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 9 Jun 2025 23:32:50 +0000 Subject: [PATCH 24/44] address comments --- .../mlir/Dialect/XeGPU/Transforms/Passes.td | 7 ++- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 59 +++++++++++-------- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index 29f936e81974e..bf95dae69518d 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -30,12 +30,13 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> { } def XeGPULayoutPropagate : Pass<"xegpu-layout-propagate"> { - let summary = "Propagate XeGPU layout information"; + let summary = "Propagate and assign XeGPU layout information"; let description = [{ This pass propagates the XeGPU layout information accross ops. Starting from a set of anchor operations (e.g. `dpas`, `store_nd`), this will - propagate the layouts required for operands and results to the producers or - consumers. + propagate the layouts required for their operands to the producers. With + this propagated layout information, pass will then update the XeGPU tensor + descriptor type with the layout information. }]; let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"]; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index 5ee034570ad0c..1f6ba5f1a6064 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -30,6 +30,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/InterleavedRange.h" #include "llvm/Support/raw_ostream.h" @@ -103,6 +104,7 @@ struct LayoutInfo { private: LaneLayout laneLayout; LaneData laneData; + xegpu::LayoutAttr layoutAttr; public: LayoutInfo() = default; @@ -186,7 +188,7 @@ struct LayoutInfoLattice : public Lattice { /// Helper Function to get the default layout for uniform values like constants. /// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1]. /// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1]. -static LayoutInfo getDefaultLayoutInfo(unsigned rank) { +static LayoutInfo getDefaultSIMTLayoutInfo(unsigned rank) { assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); if (rank == 1) return LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize}), @@ -196,7 +198,7 @@ static LayoutInfo getDefaultLayoutInfo(unsigned rank) { } /// Helper to get the default layout for a vector type. -static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) { +static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy) { // Expecting a 1D or 2D vector. assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) && "Expected 1D or 2D vector."); @@ -205,7 +207,7 @@ static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) { "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (vectorTy.getRank() == 1) - return getDefaultLayoutInfo(1); + return getDefaultSIMTLayoutInfo(1); // Packing factor is determined by the element type bitwidth. 
int packingFactor = 1; unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); @@ -221,8 +223,8 @@ static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) { /// `packedSizeInBitsForDefault` /// * For B operand, the data must be packed in minimum /// `packedSizeInBitsForDpasB` -static LayoutInfo getLayoutInfoForDPASOperand(VectorType vectorTy, - unsigned operandNum) { +static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, + unsigned operandNum) { Type elementTy = vectorTy.getElementType(); assert(elementTy.isIntOrFloat() && "Expected int or float type in DPAS operands"); @@ -237,7 +239,7 @@ static LayoutInfo getLayoutInfoForDPASOperand(VectorType vectorTy, return LayoutInfo(layout, data); } // Otherwise, return the default layout for the vector type. - return getDefaultLayoutInfo(vectorTy); + return getDefaultSIMTLayoutInfo(vectorTy); } //===----------------------------------------------------------------------===// @@ -360,17 +362,18 @@ LogicalResult LayoutInfoPropagation::visitOperation( // All other ops. .Default([&](Operation *op) { for (const LayoutInfoLattice *r : results) { - for (LayoutInfoLattice *operand : operands) { - // Propagate the layout of the result to the operand. - if (r->getValue().isAssigned()) + if (r->getValue().isAssigned()) { + for (LayoutInfoLattice *operand : operands) { + // Propagate the layout of the result to the operand. meet(operand, *r); + } } } }); // Add a dependency from each result to program point after the operation. - for (const LayoutInfoLattice *r : results) { + for (const LayoutInfoLattice *r : results) addDependency(const_cast(r), getProgramPointAfter(op)); - } + return success(); } @@ -380,7 +383,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( // Here we assign the default layout to the tensor descriptor operand of // prefetch. auto tdescTy = prefetch.getTensorDescType(); - auto prefetchLayout = getDefaultLayoutInfo( + auto prefetchLayout = getDefaultSIMTLayoutInfo( VectorType::get(tdescTy.getShape(), tdescTy.getElementType())); // Propagate the layout to the source tensor descriptor. propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout)); @@ -395,11 +398,13 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( if (!resultLayout.isAssigned()) return; // We only consider 2D -> 1D reductions at this point. - assert(resultLayout.getLayout().size() == 1 && - "Expected 1D layout for reduction result."); + if (resultLayout.getLayout().size() != 1) { + reduction.emitWarning("Expected 1D layout for reduction result. "); + return; + } // Given that the result is 1D, the layout of the operand should be 2D with // default layout. - LayoutInfo operandLayout = getDefaultLayoutInfo(2); + LayoutInfo operandLayout = getDefaultSIMTLayoutInfo(2); propagateIfChanged(operands[0], operands[0]->meet(operandLayout)); // Accumulator should have the same layout as the result. 
propagateIfChanged(operands[1], operands[1]->meet(resultLayout)); @@ -425,14 +430,15 @@ void LayoutInfoPropagation::visitDpasOp( ArrayRef results) { VectorType aTy = dpas.getLhsType(); VectorType bTy = dpas.getRhsType(); - propagateIfChanged(operands[0], - operands[0]->meet(getLayoutInfoForDPASOperand(aTy, 0))); - propagateIfChanged(operands[1], - operands[1]->meet(getLayoutInfoForDPASOperand(bTy, 1))); + propagateIfChanged( + operands[0], operands[0]->meet(getSIMTLayoutInfoForDPASOperand(aTy, 0))); + propagateIfChanged( + operands[1], operands[1]->meet(getSIMTLayoutInfoForDPASOperand(bTy, 1))); if (operands.size() > 2) { VectorType cTy = dpas.getAccType(); - propagateIfChanged(operands[2], - operands[2]->meet(getLayoutInfoForDPASOperand(cTy, 2))); + propagateIfChanged( + operands[2], + operands[2]->meet(getSIMTLayoutInfoForDPASOperand(cTy, 2))); } } @@ -440,7 +446,7 @@ void LayoutInfoPropagation::visitDpasOp( void LayoutInfoPropagation::visitStoreNdOp( xegpu::StoreNdOp store, ArrayRef operands, ArrayRef results) { - LayoutInfo storeLayout = getDefaultLayoutInfo(store.getValueType()); + LayoutInfo storeLayout = getDefaultSIMTLayoutInfo(store.getValueType()); // Both operands should have the same layout for (LayoutInfoLattice *operand : operands) { propagateIfChanged(operand, operand->meet(storeLayout)); @@ -539,7 +545,7 @@ void LayoutInfoPropagation::visitLoadGatherOp( tensorDescLayout = valueLayout.getTransposedLayout({1, 0}); } // Mask operand should have 1D default layout. - LayoutInfo maskLayout = getDefaultLayoutInfo(1); + LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(1); // Propagate the new layout to the tensor descriptor operand. propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout)); // Propagate the new layout to the mask operand. @@ -556,7 +562,7 @@ void LayoutInfoPropagation::visitCreateDescOp( if (!descLayout.isAssigned()) return; // For offset operand propagate 1D default layout. - LayoutInfo layout = getDefaultLayoutInfo(1); + LayoutInfo layout = getDefaultSIMTLayoutInfo(1); propagateIfChanged(operands[1], operands[1]->meet(layout)); } @@ -575,7 +581,8 @@ void LayoutInfoPropagation::visitStoreScatterOp( "Expected the first dimension of 2D tensor descriptor to be equal to " "subgroup size."); - LayoutInfo valueLayout = getDefaultLayoutInfo(storeScatter.getValueType()); + LayoutInfo valueLayout = + getDefaultSIMTLayoutInfo(storeScatter.getValueType()); LayoutInfo storeScatterLayout = valueLayout; if (storeScatter.getTranspose()) { // StoreScatteOp allows transpose effect. However, at the stage of this @@ -590,7 +597,7 @@ void LayoutInfoPropagation::visitStoreScatterOp( // Propagate the tensor descriptor layout. propagateIfChanged(operands[1], operands[1]->meet(storeScatterLayout)); // Use default 1D layout for mask operand. 
- LayoutInfo maskLayout = getDefaultLayoutInfo(1); + LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(1); propagateIfChanged(operands[2], operands[2]->meet(maskLayout)); } From 0f796970a0424881f0d8bcc5e260a8462ca81f1c Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 10 Jun 2025 20:50:06 +0000 Subject: [PATCH 25/44] fix bitcast --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 25 ++++--------- mlir/test/Dialect/XeGPU/layout-propagate.mlir | 35 +++++-------------- 2 files changed, 16 insertions(+), 44 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index 1f6ba5f1a6064..c8462140e8788 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -503,26 +503,15 @@ void LayoutInfoPropagation::visitVectorBitcastOp( int outElemTyBitWidth = bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth(); - // LaneLayout does not change. - const LaneLayout &newLaneLayout = resultLayout.getLayout(); - const LaneData &currData = resultLayout.getData(); - LaneData newLaneData; - // It's a widening bitcast - if (inElemTyBitWidth < outElemTyBitWidth) { - int ratio = outElemTyBitWidth / inElemTyBitWidth; - newLaneData = resultLayout.getData()[0] == 1 - ? LaneData({1, currData[1] * ratio}) - : LaneData({currData[0] * ratio, 1}); - } else { - // It's a narrowing bitcast - int ratio = inElemTyBitWidth / outElemTyBitWidth; - newLaneData = resultLayout.getData()[0] == 1 - ? LaneData({1, currData[1] / ratio}) - : LaneData({currData[0] / ratio, 1}); + // NOTE: We do not expect widening or narrowing bitcasts at this stage. Emit a + // warning and return. + if (inElemTyBitWidth != outElemTyBitWidth) { + bitcast.emitWarning("Widening or narrowing bitcasts are not expected at " + "layout propagation stage."); + return; } - propagateIfChanged(operands[0], - operands[0]->meet(LayoutInfo(newLaneLayout, newLaneData))); + propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); } /// Propagate the layout of the result to the tensor descriptor and mask diff --git a/mlir/test/Dialect/XeGPU/layout-propagate.mlir b/mlir/test/Dialect/XeGPU/layout-propagate.mlir index f698b997e8cb7..b8f5546dd8b6b 100644 --- a/mlir/test/Dialect/XeGPU/layout-propagate.mlir +++ b/mlir/test/Dialect/XeGPU/layout-propagate.mlir @@ -148,35 +148,18 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { } // ----- -// CHECK-LABEL: func.func @vector_bitcast_i16_to_i8( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xi16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { -// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xi16> to vector<8x32xi8> -func.func @vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) { +// CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xi16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xi16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xi16> to vector<8x16xf16> +// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xi16> to vector<16x16xf16> +func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, 
%arg1: memref<16x16xi16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8> + %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xi16> -> !xegpu.tensor_desc<16x16xi16> %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<32x16xi8> -> vector<32x16xi8> - %4 = vector.bitcast %2 : vector<8x16xi16> to vector<8x32xi8> - %5 = xegpu.dpas %4, %3 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> - %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> - xegpu.store_nd %5, %6 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> - return -} - -// ----- -// CHECK-LABEL: func.func @vector_bitcast_i8_to_f16( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x32xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x32xi8> to vector<8x16xf16> -// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x32xi8> to vector<16x16xf16> -func.func @vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xi8> -> vector<8x32xi8> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x32xi8> -> vector<16x32xi8> - %4 = vector.bitcast %2 : vector<8x32xi8> to vector<8x16xf16> - %5 = vector.bitcast %3 : vector<16x32xi8> to vector<16x16xf16> + %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xi16> -> vector<16x16xi16> + %4 = vector.bitcast %2 : vector<8x16xi16> to vector<8x16xf16> + %5 = vector.bitcast %3 : vector<16x16xi16> to vector<16x16xf16> %6 = xegpu.dpas %4, %5 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> From 74bf971a69bff63e47cf555e685418762f069dc4 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 11 Jun 2025 20:24:24 +0000 Subject: [PATCH 26/44] address comments --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index c8462140e8788..ede190ca4ad44 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -398,8 +398,9 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( if (!resultLayout.isAssigned()) return; // We only consider 2D -> 1D reductions at this point. - if (resultLayout.getLayout().size() != 1) { - reduction.emitWarning("Expected 1D layout for reduction result. 
"); + VectorType resultTy = llvm::dyn_cast(reduction.getDestType()); + if (!resultTy || resultTy.getRank() != 1) { + reduction.emitWarning("Expecting output type to be 1D vector."); return; } // Given that the result is 1D, the layout of the operand should be 2D with @@ -679,7 +680,7 @@ RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) { } } -using GetLayoutCallbackFnTy = function_ref; +using GetLayoutFnTy = function_ref; /// Helper to update the users of a value with a given layout. static void updateUsers(Value v, xegpu::LayoutAttr layout) { // Update all users of the value with the layout. @@ -697,7 +698,7 @@ static void updateUsers(Value v, xegpu::LayoutAttr layout) { /// attribute. The users of the result are also updated with the layout /// attribute. static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, - GetLayoutCallbackFnTy getLayoutOfValue) { + GetLayoutFnTy getLayoutOfValue) { // Iterate over all the results. for (OpResult result : op->getResults()) { @@ -734,7 +735,7 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, static void updateBranchTerminatorOpInterface( mlir::OpBuilder &builder, mlir::RegionBranchTerminatorOpInterface terminator, - GetLayoutCallbackFnTy getLayoutOfValue) { + GetLayoutFnTy getLayoutOfValue) { if (!mlir::isa(terminator->getParentOp())) return; @@ -786,7 +787,7 @@ static void updateBranchTerminatorOpInterface( /// regions with the assigned layouts. static void updateBranchOpInterface(mlir::OpBuilder &builder, mlir::RegionBranchOpInterface branch, - GetLayoutCallbackFnTy getLayoutOfValue) { + GetLayoutFnTy getLayoutOfValue) { mlir::Operation *op = branch.getOperation(); llvm::SmallVector successors; llvm::SmallVector operands(op->getNumOperands(), nullptr); @@ -865,7 +866,7 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder, /// Update the function arguments and results with the layouts. static void updateFunctionOpInterface(mlir::OpBuilder &builder, mlir::FunctionOpInterface funcOp, - GetLayoutCallbackFnTy getLayoutOfValue) { + GetLayoutFnTy getLayoutOfValue) { SmallVector newArgTypes; // Update the function arguments. for (BlockArgument arg : funcOp.getArguments()) { From d6969bc8a52bcd906f471e5e6f792bfe7db792be Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 11 Jun 2025 20:53:13 +0000 Subject: [PATCH 27/44] address comments --- mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index ede190ca4ad44..64e2271d9423b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -370,9 +370,6 @@ LogicalResult LayoutInfoPropagation::visitOperation( } } }); - // Add a dependency from each result to program point after the operation. 
- for (const LayoutInfoLattice *r : results) - addDependency(const_cast(r), getProgramPointAfter(op)); return success(); } From d5e4c6c55b94ccedf46c1447dc75499025a6e38e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 12 Jun 2025 00:27:59 +0000 Subject: [PATCH 28/44] address comments --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 35 ++++------- mlir/test/Dialect/XeGPU/layout-propagate.mlir | 60 +++++++++---------- 2 files changed, 41 insertions(+), 54 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index 64e2271d9423b..8c5a0163d1a43 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -62,18 +62,12 @@ struct Layout { Layout(std::initializer_list list) : layout(list) {} void print(llvm::raw_ostream &os) const; size_t size() const { return layout.size(); } - int64_t operator[](size_t idx) const; }; void Layout::print(llvm::raw_ostream &os) const { os << llvm::interleaved_array(layout); } -int64_t Layout::operator[](size_t idx) const { - assert(idx < layout.size() && "Index out of bounds."); - return layout[idx]; -} - /// LaneLayout represents the logical layout of lanes within a subgroup when it /// accesses some value. LaneData represents the logical layout of data owned by /// each work item. @@ -679,15 +673,15 @@ RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) { using GetLayoutFnTy = function_ref; /// Helper to update the users of a value with a given layout. -static void updateUsers(Value v, xegpu::LayoutAttr layout) { - // Update all users of the value with the layout. - for (OpOperand &user : v.getUses()) { - Operation *owner = user.getOwner(); - // Add temporary layout attribute at the user op. - std::string attrName = xegpu::getLayoutName(user); - owner->setAttr(attrName, layout); - } -} +// static void updateUsers(Value v, xegpu::LayoutAttr layout) { +// // Update all users of the value with the layout. +// for (OpOperand &user : v.getUses()) { +// Operation *owner = user.getOwner(); +// // Add temporary layout attribute at the user op. +// std::string attrName = xegpu::getLayoutName(user); +// owner->setAttr(attrName, layout); +// } +// } /// Update an operation with the layout of its results. If the result type is a /// vector type, a temporary layout attribute is added to the operation. If the @@ -721,9 +715,7 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, // If the result is a vector type, add a temporary layout attribute to the // op. std::string resultLayoutName = xegpu::getLayoutName(result); - op->setAttr(resultLayoutName, layout); - // Update all users of the result with the layout. - updateUsers(result, layout); + xegpu::setLayoutAttr(result, layout); } } @@ -854,9 +846,7 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder, // If the result is a vector type, add a temporary layout attribute to // the op. std::string resultLayoutName = xegpu::getLayoutName(r); - op->setAttr(resultLayoutName, layout); - // Update all users of the result with the layout. - updateUsers(r, layout); + xegpu::setLayoutAttr(r, layout); } } @@ -885,9 +875,6 @@ static void updateFunctionOpInterface(mlir::OpBuilder &builder, newArgTypes.back() = newTdescTy; continue; } - // If the argument is a vector type, update all the users of the argument - // with the layout. 
- updateUsers(arg, layout); } // Update the function type with the new argument types. // NOTE: We assume that function results are not expected to have layouts. diff --git a/mlir/test/Dialect/XeGPU/layout-propagate.mlir b/mlir/test/Dialect/XeGPU/layout-propagate.mlir index b8f5546dd8b6b..e0534fe29d377 100644 --- a/mlir/test/Dialect/XeGPU/layout-propagate.mlir +++ b/mlir/test/Dialect/XeGPU/layout-propagate.mlir @@ -7,9 +7,9 @@ // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> // CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> // CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_operand_2 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -26,7 +26,7 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me // ----- // CHECK-LABEL: func.func @dpas_i8( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { -// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> @@ -55,7 +55,7 @@ func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x // ----- // CHECK-LABEL: func.func @vector_transpose( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf16> +// CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf16> func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -73,8 +73,8 @@ func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, % // ----- // CHECK-LABEL: func.func @extf_truncf( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, 
#xegpu.layout>) -> vector<8x16xf32> { -// CHECK: %[[T2:.*]] = arith.extf %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf32> -// CHECK-NEXT: %{{.*}} = arith.truncf %[[T2]] {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf32> to vector<16x16xf16> +// CHECK: %[[T2:.*]] = arith.extf %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf32> +// CHECK-NEXT: %{{.*}} = arith.truncf %[[T2]] {layout_result_0 = #xegpu.layout} : vector<16x16xf32> to vector<16x16xf16> func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -89,8 +89,8 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> -// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] {layout_operand_1 = #xegpu.layout} : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{transpose}> {layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> -> vector<16x16xf16> +// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{transpose}> {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> -> vector<16x16xf16> func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -110,8 +110,8 @@ func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: mem // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> -// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] {layout_operand_1 = #xegpu.layout} : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] {layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> -> vector<16xf32> +// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, 
#xegpu.scatter_tdesc_attr<>, #xegpu.layout> +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> -> vector<16xf32> func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> %cst_0 = arith.constant dense : vector<16xi1> @@ -124,8 +124,8 @@ func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf // ----- // CHECK-LABEL: func.func @store_scatter_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} {layout_operand_1 = #xegpu.layout} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> -// CHECK-NEXT: xegpu.store %{{.*}}, %[[T0]], %{{.*}} <{transpose}> {layout_operand_0 = #xegpu.layout, layout_operand_2 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> +// CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> +// CHECK-NEXT: xegpu.store %{{.*}}, %[[T0]], %{{.*}} <{transpose}> : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { %cst = arith.constant dense<1.000000e+00> : vector<8x16xf32> %cst_0 = arith.constant dense : vector<16xi1> @@ -138,7 +138,7 @@ func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { // ----- // CHECK-LABEL: func.func @store_scatter_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { -// CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} {layout_operand_0 = #xegpu.layout, layout_operand_2 = #xegpu.layout} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> +// CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> %cst_0 = arith.constant dense : vector<16xi1> @@ -150,8 +150,8 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { // ----- // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xi16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xi16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xi16> to vector<8x16xf16> -// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xi16> to vector<16x16xf16> +// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_result_0 = #xegpu.layout} : vector<8x16xi16> to vector<8x16xf16> +// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xi16> to vector<16x16xf16> func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x16xi16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : 
index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> @@ -171,7 +171,7 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { // CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf16> +// CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -185,10 +185,10 @@ func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu. // ----- // CHECK-LABEL: func.func @binary_op_multiple_uses( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf16> -// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] {layout_operand_0 = #xegpu.layout} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16> +// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> +// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -209,13 +209,13 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: ! 
// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) -> (!xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32>) { // CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> // CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> +// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> // CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK-NEXT: scf.yield {layout_operand_2 = #xegpu.layout} %[[T7]], %[[T8]], %[[T6]] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32> -// CHECK-NEXT: } {layout_operand_5 = #xegpu.layout, layout_result_2 = #xegpu.layout} +// CHECK-NEXT: scf.yield %[[T7]], %[[T8]], %[[T6]] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32> +// CHECK-NEXT: } {layout_result_2 = #xegpu.layout} // CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index @@ -241,10 +241,10 @@ func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: me // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { // CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) { // CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: scf.yield {layout_operand_0 = #xegpu.layout} %[[T3]] : vector<16x16xf16> +// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { // CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: scf.yield {layout_operand_0 = #xegpu.layout} %[[T4]] : vector<16x16xf16> +// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) { %0 = xegpu.load_nd %arg0 : 
!xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> @@ -265,10 +265,10 @@ func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tens // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) { // CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: scf.yield {layout_operand_0 = #xegpu.layout} %[[T3]] : vector<16x16xf16> +// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { // CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: scf.yield {layout_operand_0 = #xegpu.layout} %[[T4]] : vector<16x16xf16> +// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> @@ -288,7 +288,7 @@ func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t // ----- // CHECK-LABEL: func.func @vector_outer_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { -// CHECK: %{{.*}} = vector.multi_reduction , %[[ARG0]], %{{.*}} {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} [0] : vector<16x16xf32> to vector<16xf32> +// CHECK: %{{.*}} = vector.multi_reduction , %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout} [0] : vector<16x16xf32> to vector<16xf32> func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<0.000000e+00> : vector<16xf32> %0 = vector.multi_reduction , %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32> @@ -299,7 +299,7 @@ func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor // ----- // CHECK-LABEL: func.func @vector_inner_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { -// CHECK: %{{.*}} = vector.multi_reduction , %[[ARG0]], %{{.*}} {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} [1] : vector<16x16xf32> to vector<16xf32> +// CHECK: %{{.*}} = vector.multi_reduction , %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout} [1] : vector<16x16xf32> to vector<16xf32> func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<0.000000e+00> : vector<16xf32> %0 = vector.multi_reduction , %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32> From 94da37e54ea094474301250d628d25104e4ff096 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 12 Jun 2025 18:28:02 +0000 Subject: [PATCH 29/44] address comments --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 11 ------ .../Transforms/XeGPUSubgroupDistribute.cpp | 21 +++++++++++ 
.../Dialect/XeGPU/subgroup-distribute.mlir | 36 +++++++++---------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index 8c5a0163d1a43..a26b2e83580da 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -672,17 +672,6 @@ RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) { } using GetLayoutFnTy = function_ref; -/// Helper to update the users of a value with a given layout. -// static void updateUsers(Value v, xegpu::LayoutAttr layout) { -// // Update all users of the value with the layout. -// for (OpOperand &user : v.getUses()) { -// Operation *owner = user.getOwner(); -// // Add temporary layout attribute at the user op. -// std::string attrName = xegpu::getLayoutName(user); -// owner->setAttr(attrName, layout); -// } -// } - /// Update an operation with the layout of its results. If the result type is a /// vector type, a temporary layout attribute is added to the operation. If the /// result type is a tensor descriptor type, the type is updated with the layout diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index eb8192417f843..747e01f329c03 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -812,6 +812,27 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( } void XeGPUSubgroupDistributePass::runOnOperation() { + // Attach layout to operands. + Operation *op = getOperation(); + op->walk([&](Operation *op) { + for (OpOperand &operand : op->getOpOperands()) { + // Layouts are needed for vector type only. + if (!isa(operand.get().getType())) + continue; + // If the operand already has a layout, skip it. + if (xegpu::getLayoutAttr(operand)) + continue; + + xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operand); + if (!layout) { + op->emitError("Could not find layout attribute for operand ") + << operand.getOperandNumber() << " of operation " << op->getName(); + signalPassFailure(); + return; + } + xegpu::setLayoutAttr(operand, layout); + } + }); // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0 // operation. 
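  // (Illustrative sketch, assuming the 16-lane subgroup configuration used in
  // the tests: after this step the whole function body sits inside a single
  //   gpu.warp_execute_on_lane_0 (%lane_id)[16] { ...original ops... }
  // region, which the distribution patterns applied later rewrite op by op.)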
{ diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 7362c175a70a4..fef03560dddd7 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -11,7 +11,7 @@ gpu.module @test { %c0 = arith.constant 0 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf32> %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - xegpu.store_nd %cst, %0 {layout_operand_0 = #xegpu.layout} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + xegpu.store_nd %cst, %0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> gpu.return } } @@ -27,7 +27,7 @@ gpu.module @test { %c0 = arith.constant 0 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf16> %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %cst, %0 {layout_operand_0 = #xegpu.layout} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %cst, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -47,7 +47,7 @@ gpu.module @test { %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.layout> -> vector<16xf32> %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - xegpu.store_nd %1, %2 {layout_operand_0 = #xegpu.layout} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> gpu.return } } @@ -65,7 +65,7 @@ gpu.module @test { %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %1, %2 {layout_operand_0 = #xegpu.layout} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -85,9 +85,9 @@ gpu.module @test { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<2x16x16xf16> - %2 = vector.extract %1[%c0] {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<16x16xf16> from vector<2x16x16xf16> + %2 = vector.extract %1[%c0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> from vector<2x16x16xf16> %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %2, %3 {layout_operand_0 = #xegpu.layout} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -109,9 +109,9 @@ gpu.module @test { %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> 
%2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %4, %5 {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return } } @@ -137,10 +137,10 @@ gpu.module @test { %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = math.exp %4 {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xf32> + %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + %5 = math.exp %4 {layout_result_0 = #xegpu.layout} : vector<8x16xf32> %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %5, %6 {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return } } @@ -160,7 +160,7 @@ gpu.module @test { %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %1, %2 {layout_operand_0 = #xegpu.layout} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -205,10 +205,10 @@ gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16> %6 = xegpu.create_nd_tdesc %arg1[%arg3, %1] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> %7 = xegpu.load_nd %5 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> %8 = xegpu.load_nd %6 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> - %9 = xegpu.dpas %7, %8, %arg4 {layout_operand_0 = #xegpu.layout, layout_operand_1 = #xegpu.layout, layout_result_0 = #xegpu.layout} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> - scf.yield {layout_operand_0 = #xegpu.layout} %9 : vector<8x16xf32> - } {layout_operand_3 = #xegpu.layout, layout_result_0 = 
#xegpu.layout} - xegpu.store_nd %4, %2 {layout_operand_0 = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %9 = xegpu.dpas %7, %8, %arg4 {layout_result_0 = #xegpu.layout} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %9 : vector<8x16xf32> + } {layout_result_0 = #xegpu.layout} + xegpu.store_nd %4, %2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return } } @@ -227,7 +227,7 @@ gpu.module @test { %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf32> %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> %1 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32, #xegpu.layout> - xegpu.store_nd %cst, %1 {layout_operand_0 = #xegpu.layout} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + xegpu.store_nd %cst, %1 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> gpu.return } } @@ -246,7 +246,7 @@ gpu.module @test { %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf32> %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - xegpu.store_nd %cst, %1 {layout_operand_0 = #xegpu.layout} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + xegpu.store_nd %cst, %1 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> gpu.return } } From 76671e2538bfacce83ad2f594ead5b19eb0de1c4 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 12 Jun 2025 19:07:54 +0000 Subject: [PATCH 30/44] address comments --- .../Transforms/XeGPUSubgroupDistribute.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 747e01f329c03..869f99c206c96 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -812,16 +812,16 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( } void XeGPUSubgroupDistributePass::runOnOperation() { - // Attach layout to operands. + // Step 1: Attach layout to op operands. + // TODO: Following assumptions are made: + // 1) It is assumed that there are no layout conflicts. + // 2) Any existing layout attributes attached to the operands are ignored. Operation *op = getOperation(); op->walk([&](Operation *op) { for (OpOperand &operand : op->getOpOperands()) { // Layouts are needed for vector type only. if (!isa(operand.get().getType())) continue; - // If the operand already has a layout, skip it. - if (xegpu::getLayoutAttr(operand)) - continue; xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operand); if (!layout) { @@ -833,8 +833,8 @@ void XeGPUSubgroupDistributePass::runOnOperation() { xegpu::setLayoutAttr(operand, layout); } }); - // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0 - // operation. + // Step 2: Move all operations of a GPU function inside + // gpu.warp_execute_on_lane_0 operation. { RewritePatternSet patterns(&getContext()); patterns.add(&getContext()); @@ -853,7 +853,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() { } }); } - // Apply subgroup to workitem distribution patterns. + // Step 3: Finally, Apply subgroup to workitem distribution patterns. 
RewritePatternSet patterns(&getContext()); xegpu::populateXeGPUSubgroupDistributePatterns(patterns); // TODO: distributionFn and shuffleFn are not used at this point. @@ -874,8 +874,8 @@ void XeGPUSubgroupDistributePass::runOnOperation() { return; } - // Clean up UnrealizedConversionCastOps that were inserted due to tensor - // desc type mismatches created by using upstream distribution patterns + // Step 4: Clean up UnrealizedConversionCastOps that were inserted due to + // tensor desc type mismatches created by using upstream distribution patterns // (scf.for) getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) { // We are only interested in UnrealizedConversionCastOps there were added From 32f8c799b523c1906a3334893d33587bbbd72866 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 12 Jun 2025 19:48:25 +0000 Subject: [PATCH 31/44] address comments --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 32 ++++++++++--------- .../Transforms/XeGPUSubgroupDistribute.cpp | 10 +++--- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index a26b2e83580da..0376d1c8c4ff4 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -19,6 +19,7 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" @@ -341,9 +342,6 @@ LogicalResult LayoutInfoPropagation::visitOperation( .Case([&](auto prefetchNdOp) { visitPrefetchNdOp(prefetchNdOp, operands, results); }) - // No need to propagate the layout to operands in CreateNdDescOp because - // they are scalars (offsets, sizes, etc.). - .Case([&](auto createNdDescOp) {}) .Case([&](auto transposeOp) { visitTransposeOp(transposeOp, operands, results); }) @@ -355,12 +353,18 @@ LogicalResult LayoutInfoPropagation::visitOperation( }) // All other ops. .Default([&](Operation *op) { - for (const LayoutInfoLattice *r : results) { - if (r->getValue().isAssigned()) { - for (LayoutInfoLattice *operand : operands) { - // Propagate the layout of the result to the operand. - meet(operand, *r); - } + for (const LayoutInfoLattice *resultInfo : results) { + if (!resultInfo->getValue().isAssigned()) + continue; + for (auto [operandInfo, operand] : + llvm::zip(operands, op->getOpOperands())) { + // If the operand type is not a vector or tensor descriptor, skip + // it. + if (!isa( + operand.get().getType())) + continue; + // Propagate the result layout to the operand. + meet(operandInfo, *resultInfo); } } }); @@ -456,7 +460,8 @@ void LayoutInfoPropagation::visitLoadNdOp( return; LayoutInfo tensorDescLayout = valueLayout; // LoadNdOp has the transpose effect. However, at the stage of this analysis - // this effect is not expected and should be abstracted away. Emit a warning. + // this effect is not expected and should be abstracted away. Emit a + // warning. if (auto transpose = load.getTranspose()) { load.emitWarning("Transpose effect is not expected for LoadNdOp at " "LayoutInfoPropagation stage."); @@ -495,8 +500,8 @@ void LayoutInfoPropagation::visitVectorBitcastOp( int outElemTyBitWidth = bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth(); - // NOTE: We do not expect widening or narrowing bitcasts at this stage. Emit a - // warning and return. 
+  // NOTE: We do not expect widening or narrowing bitcasts at this stage. Emit
+  // a warning and return.
   if (inElemTyBitWidth != outElemTyBitWidth) {
     bitcast.emitWarning("Widening or narrowing bitcasts are not expected at "
                         "layout propagation stage.");
@@ -583,7 +588,6 @@ void LayoutInfoPropagation::visitStoreScatterOp(
 }
 
 namespace {
-
 //===----------------------------------------------------------------------===//
 // RunLayoutInfoPropagation
 //===----------------------------------------------------------------------===//
@@ -679,7 +683,6 @@ using GetLayoutFnTy = function_ref<xegpu::LayoutAttr(Value)>;
 /// attribute.
 static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
                      GetLayoutFnTy getLayoutOfValue) {
-  // Iterate over all the results.
   for (OpResult result : op->getResults()) {
     Type resultType = result.getType();
@@ -872,7 +875,6 @@ static void updateFunctionOpInterface(mlir::OpBuilder &builder,
 }
 
 namespace {
-
 struct XeGPULayoutPropagatePass final
     : public xegpu::impl::XeGPULayoutPropagateBase<XeGPULayoutPropagatePass> {
   void runOnOperation() override;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 869f99c206c96..8b818b21ca436 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -812,7 +812,7 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
 }
 
 void XeGPUSubgroupDistributePass::runOnOperation() {
-  // Step 1: Attach layout to op operands.
+  // Step 1: Attach layouts to op operands.
   // TODO: Following assumptions are made:
   // 1) It is assumed that there are no layout conflicts.
   // 2) Any existing layout attributes attached to the operands are ignored.
@@ -853,7 +853,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
       }
     });
   }
-  // Step 3: Finally, Apply subgroup to workitem distribution patterns.
+  // Step 3: Apply subgroup to workitem distribution patterns.
   RewritePatternSet patterns(&getContext());
   xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
   // TODO: distributionFn and shuffleFn are not used at this point.
@@ -874,9 +874,9 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     return;
   }
 
-  // Step 4: Clean up UnrealizedConversionCastOps that were inserted due to
-  // tensor desc type mismatches created by using upstream distribution patterns
-  // (scf.for)
+  // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
+  // due to tensor desc type mismatches created by using upstream distribution
+  // patterns (scf.for)
   getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
     // We are only interested in UnrealizedConversionCastOps that were added
    // for resolving SIMT type mismatches.
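For context, a minimal sketch of the IR this propagation produces after the
changes above (illustrative only; the function below is hypothetical, and the
lane_layout/lane_data values assume the 16-lane subgroup configuration used
throughout the tests in this series):

    func.func @layout_example(%arg0: memref<8x16xf16>) {
      %c0 = arith.constant 0 : index
      // The tensor descriptor carries its layout in the type ...
      %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16>
        -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
      // ... while vector-producing ops get a temporary layout_result_0 attribute.
      %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
        : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
      return
    }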
From 9cefe6fab894b903f22647c0e4f981bd1dcc8d24 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 13 Jun 2025 18:08:04 +0000 Subject: [PATCH 32/44] address comments --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 55 ++++++++++--------- .../Transforms/XeGPUSubgroupDistribute.cpp | 7 +-- .../Dialect/XeGPU/subgroup-distribute.mlir | 6 +- 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index 0376d1c8c4ff4..c36b2897e7903 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -444,9 +444,8 @@ void LayoutInfoPropagation::visitStoreNdOp( ArrayRef results) { LayoutInfo storeLayout = getDefaultSIMTLayoutInfo(store.getValueType()); // Both operands should have the same layout - for (LayoutInfoLattice *operand : operands) { + for (LayoutInfoLattice *operand : operands) propagateIfChanged(operand, operand->meet(storeLayout)); - } } /// Propagate the layout of the value to the tensor descriptor operand in @@ -659,20 +658,18 @@ RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) { SmallVector funcOps; if (auto modOp = dyn_cast(target)) { - for (auto funcOp : modOp.getOps()) { + for (auto funcOp : modOp.getOps()) funcOps.push_back(funcOp); - } + // Collect all GpuFuncOps in the module. for (auto gpuModOp : modOp.getOps()) { - for (auto gpuFuncOp : gpuModOp.getOps()) { + for (auto gpuFuncOp : gpuModOp.getOps()) funcOps.push_back(gpuFuncOp); - } } } // Print the analysis result for each function. - for (FunctionOpInterface funcOp : funcOps) { + for (FunctionOpInterface funcOp : funcOps) printFunctionResult(funcOp); - } } using GetLayoutFnTy = function_ref; @@ -706,7 +703,6 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } // If the result is a vector type, add a temporary layout attribute to the // op. - std::string resultLayoutName = xegpu::getLayoutName(result); xegpu::setLayoutAttr(result, layout); } } @@ -717,6 +713,7 @@ static void updateBranchTerminatorOpInterface( mlir::OpBuilder &builder, mlir::RegionBranchTerminatorOpInterface terminator, GetLayoutFnTy getLayoutOfValue) { + // Only process if the terminator is inside a region branch op. if (!mlir::isa(terminator->getParentOp())) return; @@ -729,9 +726,10 @@ static void updateBranchTerminatorOpInterface( if (!successor.isParent()) continue; - mlir::OperandRange operands = terminator.getSuccessorOperands(successor); - mlir::ValueRange inputs = successor.getSuccessorInputs(); - for (auto [operand, input] : llvm::zip(operands, inputs)) { + mlir::OperandRange forwardedOperands = + terminator.getSuccessorOperands(successor); + mlir::ValueRange regionArgs = successor.getSuccessorInputs(); + for (auto [operand, input] : llvm::zip(forwardedOperands, regionArgs)) { // print arg and inp // llvm::errs() << "arg: " << operand << ", inp: " << input << "\n"; Type inputType = input.getType(); @@ -773,38 +771,43 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder, llvm::SmallVector successors; llvm::SmallVector operands(op->getNumOperands(), nullptr); branch.getEntrySuccessorRegions(operands, successors); - DenseMap resultToLayouts; + DenseMap + resultToLayouts; // This map keeps track of layouts of any unused results + // of the branch op. 
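+  // For example (illustrative): an scf.for result that is never consumed
+  // later has no layout demand from its users, but a layout is still
+  // recorded here so the region's block arguments and yielded values can be
+  // updated consistently.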
mlir::ValueRange results = op->getResults(); for (mlir::RegionSuccessor &successor : successors) { + // Only interested in successor regions that are contained within the op. if (successor.isParent()) continue; - mlir::OperandRange operands = branch.getEntrySuccessorOperands(successor); - mlir::ValueRange inputs = successor.getSuccessorInputs(); + mlir::OperandRange forwardedOperands = + branch.getEntrySuccessorOperands(successor); + mlir::ValueRange regionArgs = successor.getSuccessorInputs(); - for (auto [operand, input, result] : llvm::zip(operands, inputs, results)) { - Type inputType = input.getType(); + for (auto [forwardedOperand, regionArg, result] : + llvm::zip(forwardedOperands, regionArgs, results)) { + Type inputType = regionArg.getType(); if (!isa(inputType)) continue; - xegpu::LayoutAttr inputLayout = getLayoutOfValue(input); - xegpu::LayoutAttr operandLayout = getLayoutOfValue(operand); + xegpu::LayoutAttr inputLayout = getLayoutOfValue(regionArg); + xegpu::LayoutAttr operandLayout = getLayoutOfValue(forwardedOperand); if (!inputLayout || !operandLayout) { - LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << input - << " or init arg: " << operand << "\n"); + LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << regionArg + << " or init arg: " << forwardedOperand << "\n"); continue; } // TODO: We expect these two to match. assert(inputLayout == operandLayout && - "Expexing block arg and init arg to have the same layout."); + "Expecting block arg and init arg to have the same layout."); // Get tensor descriptor type with the layout. auto tdescTy = dyn_cast(inputType); auto newTdescTy = xegpu::TensorDescType::get( tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(), tdescTy.getEncoding(), inputLayout); - input.setType(newTdescTy); + regionArg.setType(newTdescTy); // Store the layout for the result. if (resultToLayouts.count(result) != 0 && resultToLayouts[result] != inputLayout) { @@ -837,7 +840,6 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder, } // If the result is a vector type, add a temporary layout attribute to // the op. - std::string resultLayoutName = xegpu::getLayoutName(r); xegpu::setLayoutAttr(r, layout); } } @@ -865,7 +867,6 @@ static void updateFunctionOpInterface(mlir::OpBuilder &builder, tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); arg.setType(newTdescTy); newArgTypes.back() = newTdescTy; - continue; } } // Update the function type with the new argument types. @@ -887,9 +888,9 @@ void XeGPULayoutPropagatePass::runOnOperation() { // Helper to convert LayoutInfo to xegpu::LayoutAttr. auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr { LayoutInfo layout = analyis.getLayoutInfo(val); - if (!layout.isAssigned()) { + if (!layout.isAssigned()) return {}; - } + SmallVector laneLayout, laneData; for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(), layout.getDataAsArrayRef())) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 8b818b21ca436..dc3dc70e325a3 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -97,9 +97,9 @@ getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout, // dimensions are not distributed. 
unsigned distributionStart = originalType.getRank() - laneLayout.size(); for (auto [i, dim] : llvm::enumerate(originalType.getShape())) { - if (i < distributionStart) { + if (i < distributionStart) continue; - } + // Check if the dimension can be distributed evenly. if (dim % laneLayout[i - distributionStart] != 0) return failure(); @@ -848,9 +848,8 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // GPU index ops, scalar constants, etc.). This will simplify the // later lowering and avoid custom patterns for these ops. getOperation()->walk([&](Operation *op) { - if (auto warpOp = dyn_cast(op)) { + if (auto warpOp = dyn_cast(op)) vector::moveScalarUniformCode(warpOp); - } }); } // Step 3: Apply subgroup to workitem distribution patterns. diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index fef03560dddd7..a59633b0cbd9a 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -166,8 +166,8 @@ gpu.module @test { } // ----- -// TODO: gemm does not use update_nd_offset because of an issue in vector distribution. PR141853 tracks this issue. -// CHECK-LABEL: gpu.func @gemm_loop +// TODO: gemm does not use update_nd_offset because of an issue in scf-for distribution. +// CHECK-LABEL: gpu.func @gemm // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { // CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x // CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y @@ -189,7 +189,7 @@ gpu.module @test { // CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> // CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @test { -gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ +gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c8 = arith.constant 8 : index From 57824d8d520258c9bc48e7ec0d0547640cea75cc Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 13 Jun 2025 21:00:22 +0000 Subject: [PATCH 33/44] address comments --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 174 +++++++++--------- 1 file changed, 85 insertions(+), 89 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index c36b2897e7903..b512d4c0f2878 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/InterleavedRange.h" +#include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" namespace mlir { @@ -678,23 +679,23 @@ using GetLayoutFnTy = function_ref; /// result type is a tensor descriptor type, the type is updated with the layout /// attribute. The users of the result are also updated with the layout /// attribute. -static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, - GetLayoutFnTy getLayoutOfValue) { +static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, + GetLayoutFnTy getLayoutOfValue) { // Iterate over all the results. 
for (OpResult result : op->getResults()) { Type resultType = result.getType(); // Layouts are needed only for vector and tensor descriptor types. if (!isa(resultType)) continue; - // If the result has any users, we expect it to have a layout. + // If the result has any users, emit a warning and continue. xegpu::LayoutAttr layout = getLayoutOfValue(result); if (!layout && result.getNumUses() > 0) { - LLVM_DEBUG(DBGS() << "Expecting layout for result: " << result - << " but got none.\n"); + op->emitWarning("op has users but no layout assigned for its result"); continue; } + // If the result is a tensor descriptor type, update the tensor desc type + // with layout. if (auto tensorDescTy = dyn_cast(resultType)) { - // TODO: Handle error. auto typeWithLayout = xegpu::TensorDescType::get( tensorDescTy.getContext(), tensorDescTy.getShape(), tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); @@ -705,17 +706,18 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op, // op. xegpu::setLayoutAttr(result, layout); } + return success(); } /// Update the types of successor regions of a branch terminator op (scf.yield) /// with assigned layouts. -static void updateBranchTerminatorOpInterface( +static LogicalResult updateBranchTerminatorOpInterface( mlir::OpBuilder &builder, mlir::RegionBranchTerminatorOpInterface terminator, GetLayoutFnTy getLayoutOfValue) { // Only process if the terminator is inside a region branch op. if (!mlir::isa(terminator->getParentOp())) - return; + return success(); llvm::SmallVector successors; llvm::SmallVector operands(terminator->getNumOperands(), @@ -729,51 +731,59 @@ static void updateBranchTerminatorOpInterface( mlir::OperandRange forwardedOperands = terminator.getSuccessorOperands(successor); mlir::ValueRange regionArgs = successor.getSuccessorInputs(); - for (auto [operand, input] : llvm::zip(forwardedOperands, regionArgs)) { - // print arg and inp - // llvm::errs() << "arg: " << operand << ", inp: " << input << "\n"; - Type inputType = input.getType(); - if (!isa(inputType)) + for (auto [forwardedOperand, regionArg] : + llvm::zip(forwardedOperands, regionArgs)) { + Type inputType = regionArg.getType(); + // We only need to operate on tensor descriptor or vector types. + if (!isa(inputType)) continue; - xegpu::LayoutAttr inputLayout = getLayoutOfValue(input); - xegpu::LayoutAttr operandLayout = getLayoutOfValue(operand); + xegpu::LayoutAttr argLayout = getLayoutOfValue(regionArg); + xegpu::LayoutAttr operandLayout = getLayoutOfValue(forwardedOperand); + // If either of the layouts is not assigned, we cannot proceed. if (!operandLayout) { - LLVM_DEBUG(DBGS() << "Expecting layout for region successor operand : " - << operand << " but got none.\n"); - continue; - } - - if (inputLayout && inputLayout != operandLayout) { LLVM_DEBUG( DBGS() - << "Conflicting layouts for region successor operand and input: " - << inputLayout << " vs " << operandLayout << "\n"); - continue; + << "No layout assigned for forwarded operand in branch terminator: " + << forwardedOperand << "\n"); + return failure(); + } + // We expect the layouts to match. + if (argLayout && argLayout != operandLayout) { + LLVM_DEBUG(DBGS() << "Conflicting layouts for region argument and " + "operand forwarded as the argument: " + << argLayout << " vs " << operandLayout << "\n"); + return failure(); } // Get tensor descriptor type with the layout. 
- auto tdescTy = dyn_cast(inputType); - auto newTdescTy = xegpu::TensorDescType::get( - tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(), - tdescTy.getEncoding(), operandLayout); - input.setType(newTdescTy); + if (auto tdescTy = dyn_cast(inputType)) { + auto newTdescTy = xegpu::TensorDescType::get( + tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(), + tdescTy.getEncoding(), operandLayout); + regionArg.setType(newTdescTy); + continue; + } + // If the type is a vector type and this region argument is an OpResult, + // set the layout attribute on the OpResult. + if (auto result = dyn_cast(regionArg)) + xegpu::setLayoutAttr(result, operandLayout); } } + return success(); } /// Some operations contain multiple regions (like scf.for) each of which have /// block arguments. This function updates the block arguments types of such -/// regions with the assigned layouts. -static void updateBranchOpInterface(mlir::OpBuilder &builder, - mlir::RegionBranchOpInterface branch, - GetLayoutFnTy getLayoutOfValue) { +/// regions with the assigned layouts. Note that results of the region op is +/// updated by the branch terminator op interface. +static LogicalResult +updateBranchOpInterface(mlir::OpBuilder &builder, + mlir::RegionBranchOpInterface branch, + GetLayoutFnTy getLayoutOfValue) { mlir::Operation *op = branch.getOperation(); llvm::SmallVector successors; llvm::SmallVector operands(op->getNumOperands(), nullptr); branch.getEntrySuccessorRegions(operands, successors); - DenseMap - resultToLayouts; // This map keeps track of layouts of any unused results - // of the branch op. mlir::ValueRange results = op->getResults(); for (mlir::RegionSuccessor &successor : successors) { @@ -788,66 +798,41 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder, for (auto [forwardedOperand, regionArg, result] : llvm::zip(forwardedOperands, regionArgs, results)) { Type inputType = regionArg.getType(); + // Only update tensor descriptor types in region args. if (!isa(inputType)) continue; - xegpu::LayoutAttr inputLayout = getLayoutOfValue(regionArg); + xegpu::LayoutAttr argLayout = getLayoutOfValue(regionArg); xegpu::LayoutAttr operandLayout = getLayoutOfValue(forwardedOperand); - if (!inputLayout || !operandLayout) { - LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << regionArg - << " or init arg: " << forwardedOperand << "\n"); - continue; + if (!argLayout || !operandLayout) { + LLVM_DEBUG(DBGS() << "No layout assigned for region arg: " << regionArg + << " or forwarded operand to that arg: " + << forwardedOperand << "\n"); + return failure(); } - // TODO: We expect these two to match. - assert(inputLayout == operandLayout && - "Expecting block arg and init arg to have the same layout."); + // We expect the layouts to match. + if (argLayout != operandLayout) { + LLVM_DEBUG(DBGS() << "Conflicting layouts for region argument and " + "operand forwarded as the argument: " + << argLayout << " vs " << operandLayout << "\n"); + return failure(); + } // Get tensor descriptor type with the layout. auto tdescTy = dyn_cast(inputType); auto newTdescTy = xegpu::TensorDescType::get( tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(), - tdescTy.getEncoding(), inputLayout); + tdescTy.getEncoding(), argLayout); regionArg.setType(newTdescTy); - // Store the layout for the result. 
- if (resultToLayouts.count(result) != 0 && - resultToLayouts[result] != inputLayout) { - LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result - << " - " << resultToLayouts[result] << " vs " - << inputLayout << "\n"); - } else { - resultToLayouts[result] = inputLayout; - } - } - } - for (auto [i, r] : llvm::enumerate(op->getResults())) { - Type resultType = r.getType(); - if (!isa(resultType)) - continue; - xegpu::LayoutAttr layout = getLayoutOfValue(r); - if (!layout) - layout = resultToLayouts[r]; - if (!layout) { - LLVM_DEBUG(DBGS() << "No layout assigned for vector/tensor desc result:" - << r << "\n"); - continue; - } - if (auto tensorDescTy = dyn_cast(resultType)) { - auto newTdescTy = xegpu::TensorDescType::get( - tensorDescTy.getContext(), tensorDescTy.getShape(), - tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout); - r.setType(newTdescTy); - continue; } - // If the result is a vector type, add a temporary layout attribute to - // the op. - xegpu::setLayoutAttr(r, layout); } + return success(); } /// Update the function arguments and results with the layouts. -static void updateFunctionOpInterface(mlir::OpBuilder &builder, - mlir::FunctionOpInterface funcOp, - GetLayoutFnTy getLayoutOfValue) { +static LogicalResult updateFunctionOpInterface(mlir::OpBuilder &builder, + mlir::FunctionOpInterface funcOp, + GetLayoutFnTy getLayoutOfValue) { SmallVector newArgTypes; // Update the function arguments. for (BlockArgument arg : funcOp.getArguments()) { @@ -859,7 +844,7 @@ static void updateFunctionOpInterface(mlir::OpBuilder &builder, if (!layout) { LLVM_DEBUG(DBGS() << "Expecting layout for function argument: " << arg << " but got none.\n"); - continue; + return failure(); } if (auto tensorDescTy = dyn_cast(argType)) { auto newTdescTy = xegpu::TensorDescType::get( @@ -873,6 +858,7 @@ static void updateFunctionOpInterface(mlir::OpBuilder &builder, // NOTE: We assume that function results are not expected to have layouts. 
funcOp.setType(FunctionType::get(funcOp.getContext(), newArgTypes, funcOp.getResultTypes())); + return success(); } namespace { @@ -902,27 +888,37 @@ void XeGPULayoutPropagatePass::runOnOperation() { mlir::OpBuilder builder(&getContext()); Operation *op = getOperation(); - op->walk([&](mlir::Block *block) { + auto walkResult = op->walk([&](mlir::Block *block) -> WalkResult { for (mlir::Operation &op : llvm::reverse(block->getOperations())) { + LogicalResult r = success(); TypeSwitch(&op) .Case( [&](mlir::RegionBranchTerminatorOpInterface branchTermOp) { - updateBranchTerminatorOpInterface(builder, branchTermOp, - getXeGPULayoutForValue); + r = updateBranchTerminatorOpInterface(builder, branchTermOp, + getXeGPULayoutForValue); }) .Case( [&](mlir::RegionBranchOpInterface regionBrOp) { - updateBranchOpInterface(builder, regionBrOp, - getXeGPULayoutForValue); + r = updateBranchOpInterface(builder, regionBrOp, + getXeGPULayoutForValue); }) .Case( [&](mlir::FunctionOpInterface funcOp) { - updateFunctionOpInterface(builder, funcOp, - getXeGPULayoutForValue); + r = updateFunctionOpInterface(builder, funcOp, + getXeGPULayoutForValue); }) .Default([&](Operation *op) { - updateOp(builder, op, getXeGPULayoutForValue); + r = updateOp(builder, op, getXeGPULayoutForValue); }); + if (failed(r)) { + op.emitError("Failed to update operation with the layout."); + return WalkResult::interrupt(); + } } + return WalkResult::advance(); }); + if (walkResult.wasInterrupted()) { + signalPassFailure(); + return; + } } From ab05be9fd4c6186177a07902e6801f2249604804 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 13 Jun 2025 22:50:47 +0000 Subject: [PATCH 34/44] address comments --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 9 ++--- mlir/test/Dialect/XeGPU/layout-propagate.mlir | 33 +++++++++++++++++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index b512d4c0f2878..60fbc3236b9be 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -725,9 +725,6 @@ static LogicalResult updateBranchTerminatorOpInterface( terminator.getSuccessorRegions(operands, successors); for (mlir::RegionSuccessor &successor : successors) { - if (!successor.isParent()) - continue; - mlir::OperandRange forwardedOperands = terminator.getSuccessorOperands(successor); mlir::ValueRange regionArgs = successor.getSuccessorInputs(); @@ -781,12 +778,12 @@ updateBranchOpInterface(mlir::OpBuilder &builder, mlir::RegionBranchOpInterface branch, GetLayoutFnTy getLayoutOfValue) { mlir::Operation *op = branch.getOperation(); - llvm::SmallVector successors; + llvm::SmallVector entrySuccessors; llvm::SmallVector operands(op->getNumOperands(), nullptr); - branch.getEntrySuccessorRegions(operands, successors); + branch.getEntrySuccessorRegions(operands, entrySuccessors); mlir::ValueRange results = op->getResults(); - for (mlir::RegionSuccessor &successor : successors) { + for (mlir::RegionSuccessor &successor : entrySuccessors) { // Only interested in successor regions that are contained within the op. 
if (successor.isParent()) continue; diff --git a/mlir/test/Dialect/XeGPU/layout-propagate.mlir b/mlir/test/Dialect/XeGPU/layout-propagate.mlir index e0534fe29d377..d3b08d651deeb 100644 --- a/mlir/test/Dialect/XeGPU/layout-propagate.mlir +++ b/mlir/test/Dialect/XeGPU/layout-propagate.mlir @@ -360,3 +360,36 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16xf16> return } + +// ----- +// CHECK-LABEL: func.func @test_scf_while_and_condition( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { +// CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout>) +// CHECK-SAME: -> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout>) { +// CHECK: scf.condition(%{{.*}}) {{.*}} : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout> +// CHECK-NEXT: } do { +// CHECK-NEXT: ^bb0(%{{.*}}: vector<16xf32>, %{{.*}}: i32, %{{.*}}: !xegpu.tensor_desc<16xf32, #xegpu.layout>): +// CHECK: scf.yield {{.*}} : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout> +// CHECK-NEXT: } attributes {layout_result_0 = #xegpu.layout} +func.func @test_scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32>) { + %c0 = arith.constant 0 : i32 + %c16 = arith.constant 16 : i32 + %c256 = arith.constant 256 : i32 + %0 = xegpu.create_nd_tdesc %arg0[0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32> + %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32> + %2 = xegpu.create_nd_tdesc %arg1[0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32> + + %3:3 = scf.while (%arg2 = %1, %arg3 = %c0, %arg4 = %0) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>) + -> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>) { + %4 = arith.cmpi slt, %arg3, %c256 : i32 + scf.condition(%4) %arg2, %arg3, %arg4 : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32> + } do { + ^bb0(%arg2: vector<16xf32>, %arg3: i32, %arg4: !xegpu.tensor_desc<16xf32>): + xegpu.store_nd %arg2, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> + %4 = arith.addi %arg3, %c16 : i32 + %5 = xegpu.update_nd_offset %arg4, [16] : !xegpu.tensor_desc<16xf32> + %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<16xf32> -> vector<16xf32> + scf.yield %6, %4, %5 : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32> + } + return +} From c4dd5a5596aaaff73ce31a2ec23265afb2de8929 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 16 Jun 2025 21:10:29 +0000 Subject: [PATCH 35/44] address comments --- mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index 60fbc3236b9be..668320736c720 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -687,7 +687,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, // Layouts are needed only for vector and tensor descriptor types. if (!isa(resultType)) continue; - // If the result has any users, emit a warning and continue. + // If the result has no layout but has users, emit a warning and continue. 
xegpu::LayoutAttr layout = getLayoutOfValue(result); if (!layout && result.getNumUses() > 0) { op->emitWarning("op has users but no layout assigned for its result"); @@ -867,10 +867,10 @@ struct XeGPULayoutPropagatePass final } // namespace void XeGPULayoutPropagatePass::runOnOperation() { - auto &analyis = getAnalysis(); + auto &analysis = getAnalysis(); // Helper to convert LayoutInfo to xegpu::LayoutAttr. auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr { - LayoutInfo layout = analyis.getLayoutInfo(val); + LayoutInfo layout = analysis.getLayoutInfo(val); if (!layout.isAssigned()) return {}; From 2c66eac61c23cab0cb34534225b27c4a9aa1045a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 16 Jun 2025 21:51:17 +0000 Subject: [PATCH 36/44] address comments --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index 668320736c720..7cf8e217f6c17 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -781,7 +781,6 @@ updateBranchOpInterface(mlir::OpBuilder &builder, llvm::SmallVector entrySuccessors; llvm::SmallVector operands(op->getNumOperands(), nullptr); branch.getEntrySuccessorRegions(operands, entrySuccessors); - mlir::ValueRange results = op->getResults(); for (mlir::RegionSuccessor &successor : entrySuccessors) { // Only interested in successor regions that are contained within the op. @@ -792,8 +791,8 @@ updateBranchOpInterface(mlir::OpBuilder &builder, branch.getEntrySuccessorOperands(successor); mlir::ValueRange regionArgs = successor.getSuccessorInputs(); - for (auto [forwardedOperand, regionArg, result] : - llvm::zip(forwardedOperands, regionArgs, results)) { + for (auto [forwardedOperand, regionArg] : + llvm::zip(forwardedOperands, regionArgs)) { Type inputType = regionArg.getType(); // Only update tensor descriptor types in region args. if (!isa(inputType)) @@ -873,14 +872,9 @@ void XeGPULayoutPropagatePass::runOnOperation() { LayoutInfo layout = analysis.getLayoutInfo(val); if (!layout.isAssigned()) return {}; - - SmallVector laneLayout, laneData; - for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(), - layout.getDataAsArrayRef())) { - laneLayout.push_back(static_cast(layout)); - laneData.push_back(static_cast(data)); - } - return xegpu::LayoutAttr::get(val.getContext(), laneLayout, laneData); + return xegpu::LayoutAttr::get( + val.getContext(), llvm::to_vector_of(layout.getLayoutAsArrayRef()), + llvm::to_vector_of(layout.getDataAsArrayRef())); }; mlir::OpBuilder builder(&getContext()); From 5705d74140a645491b5934cd23d9bd9fde968ce5 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 16 Jun 2025 22:51:21 +0000 Subject: [PATCH 37/44] address comments --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 129 ++++++------------ 1 file changed, 45 insertions(+), 84 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index 7cf8e217f6c17..7b2d1660b0a61 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -681,6 +681,10 @@ using GetLayoutFnTy = function_ref; /// attribute. 
static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, GetLayoutFnTy getLayoutOfValue) { + // Region ops (like scf.for) are already handled by the updateControlFlowOps. + if (mlir::isa(op)) + return success(); + // Iterate over all the results. for (OpResult result : op->getResults()) { Type resultType = result.getType(); @@ -709,12 +713,27 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, return success(); } -/// Update the types of successor regions of a branch terminator op (scf.yield) -/// with assigned layouts. -static LogicalResult updateBranchTerminatorOpInterface( - mlir::OpBuilder &builder, - mlir::RegionBranchTerminatorOpInterface terminator, - GetLayoutFnTy getLayoutOfValue) { +/// Update the types of successor regions at control-flow transfer points. If +/// the control flow transfers to a new block the block arguments are updated. +/// If the control flow transfers out of the region op, the result types of the +/// region op are updated. +/// Example: +/// clang-format off +/// scf.for ... iter_args(...) -> (out types) { +/// ^bb0(block types): +/// ... +/// scf.yield ... : (yield types) +/// } +/// clang-format on +/// In this example, at scf.yield, control-flow can transfer to successor +/// regions. One is the ^bb0 (for loop body) and the other is the scf.for op +/// itself (yield the results). So we update both the block arguments of the +/// successor region (i.e. block types) and the result types of the scf.for op +/// (i.e. out types). Note that yield types are updated by respective producers. +static LogicalResult +updateControlFlowOps(mlir::OpBuilder &builder, + mlir::RegionBranchTerminatorOpInterface terminator, + GetLayoutFnTy getLayoutOfValue) { // Only process if the terminator is inside a region branch op. if (!mlir::isa(terminator->getParentOp())) return success(); @@ -725,101 +744,48 @@ static LogicalResult updateBranchTerminatorOpInterface( terminator.getSuccessorRegions(operands, successors); for (mlir::RegionSuccessor &successor : successors) { - mlir::OperandRange forwardedOperands = + mlir::OperandRange successorOperands = terminator.getSuccessorOperands(successor); - mlir::ValueRange regionArgs = successor.getSuccessorInputs(); - for (auto [forwardedOperand, regionArg] : - llvm::zip(forwardedOperands, regionArgs)) { - Type inputType = regionArg.getType(); + mlir::ValueRange successorInputs = successor.getSuccessorInputs(); + for (auto [successorOperand, successorInput] : + llvm::zip(successorOperands, successorInputs)) { + Type inputType = successorInput.getType(); // We only need to operate on tensor descriptor or vector types. if (!isa(inputType)) continue; - xegpu::LayoutAttr argLayout = getLayoutOfValue(regionArg); - xegpu::LayoutAttr operandLayout = getLayoutOfValue(forwardedOperand); + xegpu::LayoutAttr successorInputLayout = getLayoutOfValue(successorInput); + xegpu::LayoutAttr successorOperandLayout = + getLayoutOfValue(successorOperand); // If either of the layouts is not assigned, we cannot proceed. - if (!operandLayout) { + if (!successorOperandLayout) { LLVM_DEBUG( DBGS() << "No layout assigned for forwarded operand in branch terminator: " - << forwardedOperand << "\n"); + << successorOperand << "\n"); return failure(); } // We expect the layouts to match. 
- if (argLayout && argLayout != operandLayout) { + if (successorInputLayout && + successorInputLayout != successorOperandLayout) { LLVM_DEBUG(DBGS() << "Conflicting layouts for region argument and " "operand forwarded as the argument: " - << argLayout << " vs " << operandLayout << "\n"); + << successorInputLayout << " vs " + << successorOperandLayout << "\n"); return failure(); } // Get tensor descriptor type with the layout. if (auto tdescTy = dyn_cast(inputType)) { auto newTdescTy = xegpu::TensorDescType::get( tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(), - tdescTy.getEncoding(), operandLayout); - regionArg.setType(newTdescTy); + tdescTy.getEncoding(), successorOperandLayout); + successorInput.setType(newTdescTy); continue; } // If the type is a vector type and this region argument is an OpResult, // set the layout attribute on the OpResult. - if (auto result = dyn_cast(regionArg)) - xegpu::setLayoutAttr(result, operandLayout); - } - } - return success(); -} - -/// Some operations contain multiple regions (like scf.for) each of which have -/// block arguments. This function updates the block arguments types of such -/// regions with the assigned layouts. Note that results of the region op is -/// updated by the branch terminator op interface. -static LogicalResult -updateBranchOpInterface(mlir::OpBuilder &builder, - mlir::RegionBranchOpInterface branch, - GetLayoutFnTy getLayoutOfValue) { - mlir::Operation *op = branch.getOperation(); - llvm::SmallVector entrySuccessors; - llvm::SmallVector operands(op->getNumOperands(), nullptr); - branch.getEntrySuccessorRegions(operands, entrySuccessors); - - for (mlir::RegionSuccessor &successor : entrySuccessors) { - // Only interested in successor regions that are contained within the op. - if (successor.isParent()) - continue; - - mlir::OperandRange forwardedOperands = - branch.getEntrySuccessorOperands(successor); - mlir::ValueRange regionArgs = successor.getSuccessorInputs(); - - for (auto [forwardedOperand, regionArg] : - llvm::zip(forwardedOperands, regionArgs)) { - Type inputType = regionArg.getType(); - // Only update tensor descriptor types in region args. - if (!isa(inputType)) - continue; - xegpu::LayoutAttr argLayout = getLayoutOfValue(regionArg); - xegpu::LayoutAttr operandLayout = getLayoutOfValue(forwardedOperand); - - if (!argLayout || !operandLayout) { - LLVM_DEBUG(DBGS() << "No layout assigned for region arg: " << regionArg - << " or forwarded operand to that arg: " - << forwardedOperand << "\n"); - return failure(); - } - - // We expect the layouts to match. - if (argLayout != operandLayout) { - LLVM_DEBUG(DBGS() << "Conflicting layouts for region argument and " - "operand forwarded as the argument: " - << argLayout << " vs " << operandLayout << "\n"); - return failure(); - } - // Get tensor descriptor type with the layout. 
-      auto tdescTy = dyn_cast(inputType);
-      auto newTdescTy = xegpu::TensorDescType::get(
-          tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(),
-          tdescTy.getEncoding(), argLayout);
-      regionArg.setType(newTdescTy);
-    }
-  }
-  return success();
-}
-
 /// Update the function arguments and results with the layouts.
 static LogicalResult updateFunctionOpInterface(mlir::OpBuilder &builder,
                                                mlir::FunctionOpInterface funcOp,

From d842d3a9cc4d5b0bc9681801edbb70abf8571187 Mon Sep 17 00:00:00 2001
From: Charitha Saumya 
Date: Tue, 17 Jun 2025 21:31:46 +0000
Subject: [PATCH 38/44] change pass name

---
 mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td       | 6 +++---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp | 2 +-
 .../XeGPU/{layout-propagate.mlir => propagate-layout.mlir} | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)
 rename mlir/test/Dialect/XeGPU/{layout-propagate.mlir => propagate-layout.mlir} (99%)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index bf95dae69518d..eb1d384589d9d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -29,14 +29,14 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
                            "vector::VectorDialect"];
 }
 
-def XeGPULayoutPropagate : Pass<"xegpu-layout-propagate"> {
+def XeGPULayoutPropagate : Pass<"xegpu-propagate-layout"> {
  let summary = "Propagate and assign XeGPU layout information";
  let description = [{
    This pass propagates the XeGPU layout information across ops. Starting
    from a set of anchor operations (e.g. `dpas`, `store_nd`), this will
    propagate the layouts required for their operands to the producers. With
-    this propagated layout information, pass will then update the XeGPU tensor
-    descriptor type with the layout information.
+    this propagated layout information, the pass will then update op result
+    types with the layout information.
}]; let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"]; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index 7b2d1660b0a61..1f1b1c106918c 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -44,7 +44,7 @@ namespace xegpu { } // namespace xegpu } // namespace mlir -#define DEBUG_TYPE "xegpu-layout-propagate" +#define DEBUG_TYPE "xegpu-propagate-layout" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") using namespace mlir; diff --git a/mlir/test/Dialect/XeGPU/layout-propagate.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir similarity index 99% rename from mlir/test/Dialect/XeGPU/layout-propagate.mlir rename to mlir/test/Dialect/XeGPU/propagate-layout.mlir index d3b08d651deeb..ea55ec384beaa 100644 --- a/mlir/test/Dialect/XeGPU/layout-propagate.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -xegpu-layout-propagate -split-input-file %s | FileCheck %s +// RUN: mlir-opt -xegpu-propagate-layout -split-input-file %s | FileCheck %s // CHECK-LABEL: func.func @dpas_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { From f091519b1ef6a36d8bd281b6efd13dbcadedd4b5 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 17 Jun 2025 21:54:10 +0000 Subject: [PATCH 39/44] fix line breaks in test --- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 93 +++++++++++++------ 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index ea55ec384beaa..429081079de1e 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -5,9 +5,12 @@ // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> 
func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { @@ -38,7 +41,8 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre // ----- // CHECK-LABEL: func.func @load_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array}> {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array}> {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -72,7 +76,8 @@ func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, % // ----- // CHECK-LABEL: func.func @extf_truncf( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) -> vector<8x16xf32> { +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) -> vector<8x16xf32> { // CHECK: %[[T2:.*]] = arith.extf %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf32> // CHECK-NEXT: %{{.*}} = arith.truncf %[[T2]] {layout_result_0 = #xegpu.layout} : vector<16x16xf32> to vector<16x16xf16> func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> { @@ -87,10 +92,13 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor // ----- // CHECK-LABEL: func.func @load_gather_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} +// CHECK-SAME: dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> -// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{transpose}> {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> -> vector<16x16xf16> +// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> -> +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{transpose}> {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> -> vector<16x16xf16> func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { 
%c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -108,10 +116,13 @@ func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: mem // ----- // CHECK-LABEL: func.func @load_gather_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { -// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} +// CHECK-SAME: dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> -// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> -> vector<16xf32> +// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> -> +// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> -> vector<16xf32> func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> %cst_0 = arith.constant dense : vector<16xi1> @@ -124,8 +135,10 @@ func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf // ----- // CHECK-LABEL: func.func @store_scatter_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> -// CHECK-NEXT: xegpu.store %{{.*}}, %[[T0]], %{{.*}} <{transpose}> : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<16xi1> +// CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> -> +// CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> +// CHECK-NEXT: xegpu.store %{{.*}}, %[[T0]], %{{.*}} <{transpose}> : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, +// CHECK-SAME: #xegpu.layout>, vector<16xi1> func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { %cst = arith.constant dense<1.000000e+00> : vector<8x16xf32> %cst_0 = arith.constant dense : vector<16xi1> @@ -138,7 +151,8 @@ func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { // ----- // CHECK-LABEL: func.func @store_scatter_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { -// CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> +// CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, 
#xegpu.scatter_tdesc_attr<>, +// CHECK-SAME: #xegpu.layout>, vector<16xi1> func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> %cst_0 = arith.constant dense : vector<16xi1> @@ -168,9 +182,13 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 // ----- // CHECK-LABEL: func.func @binary_op_one_use( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> @@ -184,7 +202,10 @@ func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu. // ----- // CHECK-LABEL: func.func @binary_op_multiple_uses( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, +// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16> // CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> // CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -206,13 +227,18 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: ! 
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> // CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> // CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) -> (!xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32>) { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> +// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) -> +// CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32>) { +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> +// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> // CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK-NEXT: scf.yield %[[T7]], %[[T8]], %[[T6]] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32> +// CHECK-NEXT: scf.yield %[[T7]], %[[T8]], %[[T6]] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32> // CHECK-NEXT: } {layout_result_2 = #xegpu.layout} // CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -238,12 +264,16 @@ func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: me // ----- // CHECK-LABEL: func.func @if_single_use( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { // CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// 
CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) { @@ -262,12 +292,17 @@ func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tens // ----- // CHECK-LABEL: func.func @if_multiple_uses( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, +// CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) { From 0ac71623adc7a4e17b525a5323b8a41ee7a9d8dd Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 17 Jun 2025 22:24:56 +0000 Subject: [PATCH 40/44] fix comment in region ops --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index 1f1b1c106918c..a8ebad2a42d54 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -713,23 +713,28 @@ 
static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
   return success();
 }
 
-/// Update the types of successor regions at control-flow transfer points. If
-/// the control flow transfers to a new block the block arguments are updated.
-/// If the control flow transfers out of the region op, the result types of the
-/// region op are updated.
-/// Example:
+/// Region ops like scf.for need special handling because they have blocks
+/// inside. If the blocks have tensor descriptor types as block arguments, their
+/// types must be updated. Also, region ops can have results that may not have
+/// any users (e.g. A and B tiles). They are not assigned a layout by the layout
+/// analysis because they have no users. However, inside the region op the
+/// corresponding block arguments for these results do have layouts. Therefore,
+/// in this case we still need to update the result types with the layout
+/// attribute. This function updates the internal block arguments and the
+/// result types of the region op with the assigned layouts.
 /// clang-format off
-/// scf.for ... iter_args(...) -> (out types) {
+/// Example: scf.for ... iter_args(...) -> (out types) {
 /// ^bb0(block types):
 /// ...
 /// scf.yield ... : (yield types)
 /// }
 /// clang-format on
-/// In this example, at scf.yield, control-flow can transfer to successor
+/// In this example, at scf.yield, control-flow can transfer to two successor
 /// regions. One is the ^bb0 (for loop body) and the other is the scf.for op
 /// itself (yield the results). So we update both the block arguments of the
 /// successor region (i.e. block types) and the result types of the scf.for op
-/// (i.e. out types). Note that yield types are updated by respective producers.
+/// (i.e. out types). Note that yield types are updated by the respective
+/// producers inside bb0.
static LogicalResult updateControlFlowOps(mlir::OpBuilder &builder, mlir::RegionBranchTerminatorOpInterface terminator, From caca184d4eb0f54935076985b668597a65c5612b Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 17 Jun 2025 22:36:37 +0000 Subject: [PATCH 41/44] remove unused headers --- .../XeGPU/Transforms/XeGPULayoutPropagate.cpp | 1 - .../XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 12 ------------ 2 files changed, 13 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp index a8ebad2a42d54..196fbc7fc8891 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp @@ -19,7 +19,6 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index dc3dc70e325a3..dabcae0bfe4b1 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -5,15 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h" -#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" -#include "mlir/Analysis/DataFlow/SparseAnalysis.h" -#include "mlir/Analysis/DataFlowFramework.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Utils/DistributionUtils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" @@ -39,12 +33,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/TypeSwitch.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/InterleavedRange.h" -#include "llvm/Support/LogicalResult.h" -#include "llvm/Support/raw_ostream.h" namespace mlir { namespace xegpu { From 0111b9f3a8661114643c31709094de658feaa3dc Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 18 Jun 2025 16:31:45 +0000 Subject: [PATCH 42/44] fix conflict --- .../XeGPU/subgroup-map-propagation.mlir | 622 ------------------ 1 file changed, 622 deletions(-) delete mode 100644 mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir diff --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir deleted file mode 100644 index 35ac39d074c70..0000000000000 --- a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir +++ /dev/null @@ -1,622 +0,0 @@ -// RUN: mlir-opt -xegpu-subgroup-distribute='print-analysis-only=true' -split-input-file %s | FileCheck %s - -// CHECK: function: dpas_f16: -// CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. 
-// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - return -} - - -// ----- -// CHECK: function: dpas_i8: -// CHECK-NEXT: argument: of type 'vector<8x32xi8>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 2] -// CHECK-NEXT: argument: of type 'vector<32x16xi8>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [4, 1] -// CHECK-NEXT: argument: of type 'memref<8x16xi32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. 
-// CHECK-NEXT: op : %[[T0:.*]] = xegpu.dpas %{{.*}} : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> - %1 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> - xegpu.store_nd %0, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> - return -} - -// ----- -// CHECK: function: load_with_transpose_effect: -// CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] <{transpose = array}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %3 = xegpu.load_nd %1 <{transpose = array}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - return -} - -// ----- -// CHECK: function: vector_transpose: -// 
CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T4:.*]] = vector.transpose %[[T3]], [1, 0] : vector<16x16xf16> to vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T4]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %4 = vector.transpose %3, [1, 0] : vector<16x16xf16> to vector<16x16xf16> - %5 = xegpu.dpas %2, %4, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - return -} - -// ----- -// CHECK: function: extf_truncf: -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], 
lane_data: [2, 1] -// CHECK-NEXT: op : %[[T2:.*]] = arith.extf %[[T1]] : vector<16x16xf16> to vector<16x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T3:.*]] = arith.truncf %[[T2]] : vector<16x16xf32> to vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: Not assigned. -func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> { - %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %2 = arith.extf %1 : vector<16x16xf16> to vector<16x16xf32> - %3 = arith.truncf %2 : vector<16x16xf32> to vector<16x16xf16> - %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - return %4 : vector<8x16xf32> -} - -// ----- -// CHECK: function: load_gather_with_transpose_effect: -// CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<256xf16>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr> -// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load %[[T2]], %[[CST0]] <{transpose}> : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %cst = arith.constant dense<[0, 16, 32, 48, 64, 
80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> - %cst_0 = arith.constant dense : vector<16xi1> - %2 = xegpu.create_tdesc %arg1, %cst : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr> - %3 = xegpu.load %2, %cst_0 <{transpose}> : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - return -} - -// ----- -// CHECK: function: load_gather_1d: -// CHECK: argument: of type 'memref<256xf32>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T1]] = xegpu.load %[[T0]], %[[CST0]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { - %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> - %cst_0 = arith.constant dense : vector<16xi1> - %0 = xegpu.create_tdesc %arg0, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.load %0, %cst_0 : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> - xegpu.store_nd %1, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> - return -} - -// ----- -// CHECK: function: store_scatter_with_transpose_effect: -// CHECK-NEXT: argument: of type 'memref<128xf32>' at index: 0 -// CHECK-NEXT: layout : Not assigned. 
-// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST1:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST1]] : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> -// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 1] -func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { - %cst = arith.constant dense<1.000000e+00> : vector<8x16xf32> - %cst_0 = arith.constant dense : vector<16xi1> - %cst_1 = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> - %0 = xegpu.create_tdesc %arg0, %cst_1 : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> - xegpu.store %cst, %0, %cst_0 <{transpose}> : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> - return -} - -// ----- -// CHECK: function: store_scatter_1d: -// CHECK-NEXT: argument: of type 'vector<16xf32>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1] -// CHECK-NEXT: argument: of type 'memref<256xf32>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST1:.*]] = arith.constant dense : vector<16xi1> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { - %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> - %cst_0 = arith.constant dense : vector<16xi1> - %0 = xegpu.create_tdesc %arg1, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - xegpu.store %arg0, %0, %cst_0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> - return -} - -// ----- -// CHECK: function: vector_bitcast_i16_to_i8: -// CHECK-NEXT: argument: of type 'memref<8x16xi16>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<32x16xi8>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<8x16xi32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. 
-// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<32x16xi8> -> vector<32x16xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1] -// CHECK-NEXT: op : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x16xi16> to vector<8x32xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T5:.*]] = xegpu.dpas %[[T4]], %[[T3]] : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<32x16xi8> -> vector<32x16xi8> - %4 = vector.bitcast %2 : vector<8x16xi16> to vector<8x32xi8> - %5 = xegpu.dpas %4, %3 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> - %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> - xegpu.store_nd %5, %6 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> - return -} - -// ----- -// CHECK: function: vector_bitcast_i8_to_f16: -// CHECK-NEXT: argument: of type 'memref<8x32xi8>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<16x32xi8>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. 
-// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x32xi8> -> vector<8x32xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x32xi8> -> vector<16x32xi8> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1] -// CHECK-NEXT: op : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x32xi8> to vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T5:.*]] = vector.bitcast %[[T3]] : vector<16x32xi8> to vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T7:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xi8> -> vector<8x32xi8> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x32xi8> -> vector<16x32xi8> - %4 = vector.bitcast %2 : vector<8x32xi8> to vector<8x16xf16> - %5 = vector.bitcast %3 : vector<16x32xi8> to vector<16x16xf16> - %6 = xegpu.dpas %4, %5 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - return -} - -// ----- -// CHECK: function: binary_op_one_use: -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T3:.*]] = arith.addf %[[T1]], %[[T2]] : vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], 
lane_data: [2, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { - %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %2 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %3 = arith.addf %1, %2 : vector<16x16xf16> - %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - xegpu.store_nd %4, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - return -} - -// ----- -// CHECK: function: binary_op_multiple_uses: -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 3 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T2:.*]] = arith.addf %[[T1]], %[[CST]] : vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.dpas %[[T0]], %[[T2]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { - %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %cst = arith.constant dense<1.000000e+00> : vector<16x16xf16> - %2 = arith.addf %1, %cst : vector<16x16xf16> - %3 = xegpu.dpas %0, %2 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - xegpu.store_nd %3, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %2, %arg3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> - return -} - -// ----- -// CHECK: function: for_op: -// CHECK-NEXT: argument: of type 'memref<8x128xf16>' at index: 0 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<128x16xf16>' at index: 1 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: argument: of type 'memref<8x16xf32>' at index: 2 -// CHECK-NEXT: layout : Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index -// CHECK-NEXT: layout for result #0: Not assigned. 
-// CHECK-NEXT: op : %{{.*}} = arith.constant 128 : index -// CHECK-NEXT: layout for result #0: Not assigned. -// CHECK-NEXT: op : %{{.*}} = arith.constant 16 : index -// CHECK-NEXT: layout for result #0: Not assigned. -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T5:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T7:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T8:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : scf.for -// CHECK-NEXT: layout for result #0: Not assigned. -// CHECK-NEXT: layout for result #1: Not assigned. 
-// CHECK-NEXT: layout for result #2: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c16 = arith.constant 16 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> - %2:3 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %0, %arg5 = %1, %arg6 = %cst) -> (!xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32>) { - %4 = xegpu.load_nd %arg4 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %5 = xegpu.load_nd %arg5 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %6 = xegpu.dpas %4, %5, %arg6 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %7 = xegpu.update_nd_offset %arg4, [%c0, %c16] : !xegpu.tensor_desc<8x16xf16> - %8 = xegpu.update_nd_offset %arg5, [%c16, %c0] : !xegpu.tensor_desc<16x16xf16> - scf.yield %7, %8, %6 : !xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32> - } - %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %2#2, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - return -} - -// ----- -// CHECK: function: if_single_use: -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: argument: of type 'i1' at index: 2 -// CHECK-NEXT: layout : Not assigned. 
-// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 3 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : scf.if -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) { - %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %1 = scf.if %arg2 -> (vector<16x16xf16>) { - %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - scf.yield %3 : vector<16x16xf16> - } else { - %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - scf.yield %3 : vector<16x16xf16> - } - %2 = xegpu.dpas %0, %1 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - return -} - -// ----- -// CHECK: function: if_multiple_uses: -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type 'i1' at index: 2 -// CHECK-NEXT: layout : Not assigned. 
-// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf32>' at index: 3 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 4 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : scf.if -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) { - %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %1 = scf.if %arg2 -> (vector<16x16xf16>) { - %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - scf.yield %3 : vector<16x16xf16> - } else { - %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - scf.yield %3 : vector<16x16xf16> - } - %2 = xegpu.dpas %0, %1 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %1, %arg4 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> - return -} - -// ----- -// CHECK: function: vector_outer_reduction: -// CHECK-NEXT: argument: of type 'vector<16x16xf32>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction , %{{.*}}, %[[CST]] [0] : vector<16x16xf32> to vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { - %cst = arith.constant dense<0.000000e+00> : vector<16xf32> - %0 = vector.multi_reduction , %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32> - xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> - return -} - -// ----- -// CHECK: function: vector_inner_reduction: -// CHECK-NEXT: argument: of type 'vector<16x16xf32>' at index: 0 -// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1 -// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction , %{{.*}}, %[[CST]] [1] : vector<16x16xf32> to 
vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { - %cst = arith.constant dense<0.000000e+00> : vector<16xf32> - %0 = vector.multi_reduction , %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32> - xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> - return -} - -// ----- -// CHECK: function: update_nd_offset_1d: -// CHECK: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @update_nd_offset_1d(%arg0: memref<256xf32>){ - %c0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %1 = arith.constant dense<1.000000e+00> : vector<16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32> - %2 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32> - xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> - return -} - -// ----- -// CHECK: function: update_nd_offset_2d: -// CHECK: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -// CHECK-NEXT: op : %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ - %c0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %1 = arith.constant dense<1.000000e+00> : vector<16x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32> - %2 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32> - xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32> - return -} - -// ----- -// CHECK: function: prefetch_2d: -// CHECK: layout for result #0: Not assigned. 
-// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @prefetch_2d(%arg0: memref<256x256xf16>){ - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> - xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16x16xf16> - return -} - -// ----- -// CHECK: function: prefetch_1d: -// CHECK: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @prefetch_1d(%arg0: memref<256xf16>){ - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> - xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16xf16> - return -} From 4de7cab049b4977f961e733805e65ad98ee845bd Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 18 Jun 2025 17:38:37 +0000 Subject: [PATCH 43/44] add option to print layout results --- .../mlir/Dialect/XeGPU/Transforms/Passes.td | 6 ++++- .../Dialect/XeGPU/Transforms/CMakeLists.txt | 2 +- ...Propagate.cpp => XeGPUPropagateLayout.cpp} | 23 +++++++++++++------ 3 files changed, 22 insertions(+), 9 deletions(-) rename mlir/lib/Dialect/XeGPU/Transforms/{XeGPULayoutPropagate.cpp => XeGPUPropagateLayout.cpp} (97%) diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index eb1d384589d9d..3a88dae041dd1 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -29,7 +29,7 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> { "vector::VectorDialect"]; } -def XeGPULayoutPropagate : Pass<"xegpu-propagate-layout"> { +def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> { let summary = "Propagate and assign XeGPU layout information"; let description = [{ This pass propagates the XeGPU layout information accross ops. 
Starting
@@ -40,6 +40,10 @@ def XeGPULayoutPropagate : Pass<"xegpu-propagate-layout"> {
   }];
   let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
                            "vector::VectorDialect"];
+  let options = [Option<
+      "printOnly", "print-analysis-only", "bool",
+      /*default=*/"false",
+      "Print the result of layout propagation analysis and exit.">];
 }
 
 def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 3b7aebfc76640..9c178d1d85642 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -4,7 +4,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
   XeGPUSubgroupDistribute.cpp
   XeGPUUnroll.cpp
   XeGPUWgToSgDistribute.cpp
-  XeGPULayoutPropagate.cpp
+  XeGPUPropagateLayout.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
similarity index 97%
rename from mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp
rename to mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 196fbc7fc8891..1db19701edb16 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutPropagate.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1,4 +1,4 @@
-//===- XeGPULayoutPropagate.cpp - XeGPU Layout Propagation ------*- C++ -*-===//
+//===- XeGPUPropagateLayout.cpp - XeGPU Layout Propagation ------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -38,7 +38,7 @@
 
 namespace mlir {
 namespace xegpu {
-#define GEN_PASS_DEF_XEGPULAYOUTPROPAGATE
+#define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
 } // namespace xegpu
 } // namespace mlir
@@ -622,8 +622,7 @@ LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) {
 }
 
 // Print the analysis result for debugging purposes.
-[[maybe_unused]] void
-RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
+void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
   auto printFunctionResult = [&](FunctionOpInterface funcOp) {
     os << "function: " << funcOp.getName() << ":\n";
     // Function arguments
@@ -828,15 +827,25 @@ static LogicalResult updateFunctionOpInterface(mlir::OpBuilder &builder,
 }
 
 namespace {
-struct XeGPULayoutPropagatePass final
-    : public xegpu::impl::XeGPULayoutPropagateBase<XeGPULayoutPropagatePass> {
+struct XeGPUPropagateLayoutPass final
+    : public xegpu::impl::XeGPUPropagateLayoutBase<XeGPUPropagateLayoutPass> {
+  XeGPUPropagateLayoutPass() = default;
+  XeGPUPropagateLayoutPass(const XeGPUPropagateLayoutPass &other) = default;
+  XeGPUPropagateLayoutPass(xegpu::XeGPUPropagateLayoutOptions options)
+      : XeGPUPropagateLayoutBase(options) {}
   void runOnOperation() override;
 };
 } // namespace
 
-void XeGPULayoutPropagatePass::runOnOperation() {
+void XeGPUPropagateLayoutPass::runOnOperation() {
   auto &analysis = getAnalysis<RunLayoutInfoPropagation>();
+  // Print the analysis result and exit (for debugging purposes).
+  if (printOnly) {
+    auto &os = llvm::outs();
+    analysis.printAnalysisResult(os);
+    return;
+  }
   // Helper to convert LayoutInfo to xegpu::LayoutAttr.
auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr {
     LayoutInfo layout = analysis.getLayoutInfo(val);

From 3a26509304258aebb7a9832f4db5e0635ebc6951 Mon Sep 17 00:00:00 2001
From: Charitha Saumya
Date: Fri, 20 Jun 2025 17:04:58 +0000
Subject: [PATCH 44/44] fix conflict

---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 1db19701edb16..cc22d2bbd8c39 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
 #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
 #include "mlir/Analysis/DataFlow/SparseAnalysis.h"
+#include "mlir/Analysis/DataFlow/Utils.h"
 #include "mlir/Analysis/DataFlowFramework.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -598,8 +599,7 @@ class RunLayoutInfoPropagation {
 
   RunLayoutInfoPropagation(Operation *op) : target(op) {
     SymbolTableCollection symbolTable;
-    solver.load<DeadCodeAnalysis>();
-    solver.load<SparseConstantPropagation>();
+    loadBaselineAnalyses(solver);
     solver.load<LayoutInfoPropagation>(symbolTable);
     (void)solver.initializeAndRun(op);
   }
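For reference, a minimal sketch of how the relocated analysis printout could be
exercised once PATCH 43 lands. The RUN line below is an assumption pieced
together from the pass flag and the print-analysis-only option defined in
Passes.td above, plus the RUN line of the deleted
subgroup-map-propagation.mlir test; it is not a line taken from these patches:

// RUN: mlir-opt -xegpu-propagate-layout='print-analysis-only=true' -split-input-file %s | FileCheck %s

The only change from the deleted test's invocation is swapping
-xegpu-subgroup-distribute for the renamed -xegpu-propagate-layout pass, which
now owns the printOnly option.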