Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ def XeGPUBlocking: Pass<"xegpu-blocking"> {
to a hardware instruction.
}];
let dependentDialects = [
"memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"
];
"memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect",
"index::IndexDialect"];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: does it use index dialect at all in the end?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it is used in addElementwise, so the pass needs to load it.

}

#endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
16 changes: 16 additions & 0 deletions mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#define MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/OpDefinition.h"
namespace mlir {

class VectorType;
Expand All @@ -18,6 +19,7 @@ class OpResult;
class OpBuilder;
class ValueRange;
class TypeConverter;
class OpFoldResult;

namespace xegpu {
class LayoutAttr;
Expand Down Expand Up @@ -128,6 +130,20 @@ void doSCFStructuralTypeConversionWithTensorType(Operation *op,
/// if no GPU module parent or XeVM target attribute exists.
std::optional<std::string> getChipStr(Operation *op);

/// Generates element-wise addition ops of two arrays with automatic alignment.
/// When the input arrays have different sizes, the shorter array is
/// right-aligned with the longer array, and the unmatched leading elements from
/// the longer array are preserved unchanged. This is commonly used for offset
/// computation where higher-dimensional offsets need to be added to
/// lower-dimensional adjustments.
///
/// Example:
/// lhs = [l1, l2, l3], rhs = [r1, r2]
/// Result: [l1, l2+r1, l3+r2]
SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc,
ArrayRef<OpFoldResult> lhs,
ArrayRef<OpFoldResult> rhs);

} // namespace xegpu

} // namespace mlir
Expand Down
12 changes: 7 additions & 5 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "mlir/Dialect/XeGPU/Transforms/Passes.h"

#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
Expand Down Expand Up @@ -155,10 +156,10 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
xegpu::UpdateOffsetOp>(op))
xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
return getTileShape(op->getOpResult(0));
if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
xegpu::LoadGatherOp>(op))
xegpu::LoadGatherOp, xegpu::StoreMatrixOp>(op))
return getTileShape(op->getOpOperand(0));
if (isa<xegpu::StoreNdOp, xegpu::StoreScatterOp>(op))
return getTileShape(op->getOpOperand(1));
Expand Down Expand Up @@ -202,17 +203,18 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {

bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
// skip the op if any of its operands or results has workgroup level layouts
bool hasWgLayoutOperands =
bool hasSgLayoutOperands =
llvm::any_of(op->getOpOperands(), [](OpOperand &opr) {
xegpu::LayoutAttr layout = xegpu::getLayoutAttr(opr);
return layout && layout.isWgLayout();
});
bool hasWgLayoutResults =
bool hasSgLayoutResults =
llvm::any_of(op->getOpResults(), [](OpResult result) {
xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result);
return layout && layout.isWgLayout();
});
if (hasWgLayoutOperands || hasWgLayoutResults) {

if (hasSgLayoutOperands || hasSgLayoutResults) {
LDBG() << "skip unrolling for op with workgroup level layout: " << *op;
return false;
}
Expand Down
87 changes: 82 additions & 5 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -682,13 +682,90 @@ struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
}
};

/// Rewrite pattern that unrolls a xegpu::LoadMatrixOp whose result vector is
/// larger than the instruction-level tile shape: the single load is replaced
/// by one LoadMatrixOp per tile, and the per-tile vectors are reassembled
/// into the original full-sized result via unpack.
struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
  using UnrollPattern<xegpu::LoadMatrixOp>::UnrollPattern;
  LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op,
                                PatternRewriter &rewriter) const override {
    // Instruction-level tile shape; bail out if none is configured for this op.
    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
    if (!targetShape)
      return failure();

    Location loc = op.getLoc();
    VectorType valueTy = op.getType();
    Type elemTy = valueTy.getElementType();
    ArrayRef<int64_t> shape = valueTy.getShape();
    // NOTE(review): `layout` may be null if the op carries no LayoutAttr, and
    // layout.dropInstData() below assumes non-null — presumably getTargetShape
    // only succeeds for ops with an inst_data layout; confirm.
    auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());

    // Result type of each per-tile load.
    VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);

    // Compute each tile's absolute offsets by adding the tile-local static
    // offsets (right-aligned) to the op's original mixed offsets.
    SmallVector<OpFoldResult> mixedOffsets = op.getMixedOffsets();
    SmallVector<SmallVector<OpFoldResult>> offsetsList;
    for (SmallVector<int64_t> offsets :
         StaticTileOffsetRange(shape, *targetShape)) {
      auto adds = xegpu::addWithRightAligned(
          rewriter, loc, mixedOffsets,
          getAsIndexOpFoldResult(op.getContext(), offsets));
      offsetsList.push_back(adds);
    }

    // Emit one tile-sized load per offset set; drop inst_data from the layout
    // since each new op already operates at instruction granularity.
    SmallVector<Value> newOps;
    for (SmallVector<OpFoldResult> offsets : offsetsList) {
      auto newOp = rewriter.create<xegpu::LoadMatrixOp>(
          op.getLoc(), newValueTy, op.getMemDesc(), offsets,
          layout.dropInstData());
      newOps.push_back(newOp);
    }
    // Reassemble the tile results into the original full-sized vector.
    Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
    rewriter.replaceOp(op, castOp);
    return success();
  }
};

struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
using UnrollPattern<xegpu::StoreMatrixOp>::UnrollPattern;
LogicalResult matchAndRewrite(xegpu::StoreMatrixOp op,
PatternRewriter &rewriter) const override {
std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
if (!targetShape)
return failure();

Location loc = op.getLoc();
VectorType valueTy = op.getData().getType();
ArrayRef<int64_t> shape = valueTy.getShape();
auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());

SmallVector<Type> convertedValTypes =
getUnrolledTypes(valueTy, *targetShape);
SmallVector<Value> convertedValues =
pack(op.getData(), convertedValTypes, *targetShape, loc, rewriter);

SmallVector<OpFoldResult> mixedOffsets = op.getMixedOffsets();
SmallVector<SmallVector<OpFoldResult>> offsetsList;
for (SmallVector<int64_t> offsets :
StaticTileOffsetRange(shape, *targetShape)) {
auto adds = xegpu::addWithRightAligned(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need to use addWithRightAligned here? The op's offsets should have always the same number as the distributed offsets (out from shape/targetshape).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is motivated by the old style (offset in createNd), in which creating a low rank tensor desc (2D) from a high rank tensor desc (e.g., 4D) was allowed. In such case, the local offset for the tensor desc is with lower rank, which needs to be added with the original high rank offsets to get the final one. It is created to make it compatible with these test cases. I am also not sure whether this support will be removed or not. It can be refactored after we completely switch to the new style.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this applies to load_matrix/store_matrix: we should restrict the shapes size are always 2D - for both matrix_desc and vector.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK, it's already guaranteed to be 2D by matrix ops verifiers.
It should be fine as is but if you can use a different helper/approach for better readability, that'd be nice too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

addWithRightAligned was supposed to handle both cases. But I think your suggestions are right. So I introduced addElementwise here.

rewriter, loc, mixedOffsets,
getAsIndexOpFoldResult(op.getContext(), offsets));
offsetsList.push_back(adds);
}

for (auto [v, offsets] : llvm::zip_equal(convertedValues, offsetsList))
rewriter.create<xegpu::StoreMatrixOp>(loc, v, op.getMemDesc(), offsets,
layout.dropInstData());

rewriter.eraseOp(op);
return success();
}
};

} // namespace

void mlir::xegpu::populateXeGPUUnrollPatterns(
RewritePatternSet &patterns, const xegpu::UnrollOptions &options) {
patterns.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp,
UnrollCreateDescOp, UnrollLoadGatherOp, UnrollStoreScatterOp,
UnrollPrefetchOp, UnrollUpdateOffsetOp>(patterns.getContext(),
options);
patterns
.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp, UnrollCreateDescOp,
UnrollLoadGatherOp, UnrollStoreScatterOp, UnrollPrefetchOp,
UnrollUpdateOffsetOp, UnrollLoadMatrixOp, UnrollStoreMatrixOp>(
patterns.getContext(), options);
}
46 changes: 46 additions & 0 deletions mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
Expand Down Expand Up @@ -133,6 +134,14 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
return getLayoutAttr(loadNd.getTensorDesc());

// for LoadMatrixOp, the layout is attached to the property of the op
if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
return dyn_cast_if_present<xegpu::LayoutAttr>(loadOp.getLayoutAttr());

// for StoreMatrixOp, the layout is attached to the property of the op
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
return dyn_cast_if_present<xegpu::LayoutAttr>(storeOp.getLayoutAttr());

std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::LayoutAttr>(layoutName);
Expand All @@ -152,6 +161,13 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {

xegpu::LayoutAttr xegpu::getLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();

if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
return dyn_cast_if_present<xegpu::LayoutAttr>(loadOp.getLayoutAttr());

if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
return dyn_cast_if_present<xegpu::LayoutAttr>(storeOp.getLayoutAttr());

std::string layoutName = xegpu::getLayoutName(opr);
if (op->hasAttr(layoutName))
return op->getAttrOfType<xegpu::LayoutAttr>(layoutName);
Expand Down Expand Up @@ -179,6 +195,8 @@ xegpu::setLayoutAttr<mlir::OpOperand>(const mlir::OpOperand &operand,
void xegpu::setLayoutAttrs(Operation *op,
function_ref<LayoutAttr(Value)> getLayoutImpl) {
op->walk([&](Operation *nestOp) {
if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(nestOp))
return;
for (OpOperand &opr : nestOp->getOpOperands()) {
auto layout = getLayoutImpl(opr.get());
setLayoutAttr(opr, layout);
Expand Down Expand Up @@ -424,3 +442,31 @@ std::optional<std::string> xegpu::getChipStr(Operation *op) {

return std::nullopt;
}

/// Generates element-wise addition ops of two arrays with automatic alignment.
/// When the input arrays have different sizes, the shorter array is
/// right-aligned with the longer array, and the unmatched leading elements from
/// the longer array are preserved unchanged. This is commonly used for offset
/// computation where higher-dimensional offsets need to be added to
/// lower-dimensional adjustments.
///
/// Example:
///   lhs = [l1, l2, l3], rhs = [r1, r2]
///   Result: [l1, l2+r1, l3+r2]
SmallVector<OpFoldResult>
xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
                           ArrayRef<OpFoldResult> lhs,
                           ArrayRef<OpFoldResult> rhs) {
  // Ensure `a` refers to the longer (or equal-length) array, `b` the shorter.
  ArrayRef<OpFoldResult> a = lhs.size() >= rhs.size() ? lhs : rhs;
  ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
  // The unmatched leading elements of `a` pass through unchanged.
  SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
  a = a.slice(a.size() - b.size());
  // The right-aligned tails have equal length by construction; add them
  // element-wise, folding to a constant when both operands are constants.
  for (auto [l, r] : llvm::zip_equal(a, b)) {
    auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
    auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
    results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
  }
  return results;
}
23 changes: 23 additions & 0 deletions mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -561,3 +561,26 @@ gpu.module @test_kernel {
gpu.return %e : vector<8x32x2xf16>
}
}

// -----
gpu.module @test_kernel {
//CHECK-LABEL: unroll_load_matrix
gpu.func @unroll_load_matrix(%arg0: memref<4096xi8, 3>) -> vector<32x32xf32> {
%0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32>
//CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32>
//CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32>
%1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32>
gpu.return %1: vector<32x32xf32>
}
}

// -----
gpu.module @test_kernel {
// CHECK-LABEL: unroll_store_matrix
gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) {
%mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
// CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index
xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
gpu.return
}
}