4 changes: 2 additions & 2 deletions mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -67,8 +67,8 @@ def XeGPUBlocking: Pass<"xegpu-blocking"> {
to a hardware instruction.
}];
let dependentDialects = [
"memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"
];
"memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect",
"index::IndexDialect"];
Contributor: nit: does it use index dialect at all in the end?

Contributor (Author): Yes, it is used in addElementwise, so the pass needs to load it.

}

#endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
7 changes: 6 additions & 1 deletion mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -20,6 +20,7 @@ class OpResult;
class OpBuilder;
class ValueRange;
class TypeConverter;
class OpFoldResult;

namespace xegpu {
class DistributeLayoutAttr;
@@ -143,6 +144,11 @@ void doSCFStructuralTypeConversionWithTensorType(Operation *op,
/// if no GPU module parent or XeVM target attribute exists.
std::optional<std::string> getChipStr(Operation *op);

/// Generates element-wise addition ops of two arrays of the same length.
SmallVector<OpFoldResult> addElementwise(OpBuilder &builder, Location loc,
ArrayRef<OpFoldResult> lhs,
ArrayRef<OpFoldResult> rhs);

/// Generates element-wise addition ops of two arrays with automatic alignment.
/// When the input arrays have different sizes, the shorter array is
/// right-aligned with the longer array, and the unmatched leading elements from
/// the longer array are preserved unchanged.
@@ -156,7 +162,6 @@ std::optional<std::string> getChipStr(Operation *op);
SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc,
ArrayRef<OpFoldResult> lhs,
ArrayRef<OpFoldResult> rhs);

} // namespace xegpu

} // namespace mlir
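For reference, a minimal sketch of the right-alignment semantics of addWithRightAligned declared above (operand names hypothetical, not part of this patch):

```cpp
// lhs = [%a, %b, %c], rhs = [%x, %y]: rhs is right-aligned against lhs,
// so %a is passed through and the two-element tails are added element-wise.
SmallVector<OpFoldResult> sums =
    xegpu::addWithRightAligned(builder, loc, lhs, rhs);
// sums == [%a, %b + %x, %c + %y]
```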
5 changes: 3 additions & 2 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -8,6 +8,7 @@

#include "mlir/Dialect/XeGPU/Transforms/Passes.h"

#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
@@ -157,10 +158,10 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
xegpu::UpdateOffsetOp>(op))
xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
return getTileShape(op->getOpResult(0));
if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
xegpu::LoadGatherOp>(op))
xegpu::LoadGatherOp, xegpu::StoreMatrixOp>(op))
return getTileShape(op->getOpOperand(0));
if (isa<xegpu::StoreNdOp, xegpu::StoreScatterOp>(op))
return getTileShape(op->getOpOperand(1));
87 changes: 82 additions & 5 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -682,13 +682,90 @@ struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
}
};

struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
using UnrollPattern<xegpu::LoadMatrixOp>::UnrollPattern;
LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op,
PatternRewriter &rewriter) const override {
Location loc = op.getLoc();
VectorType valueTy = op.getType();
std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
if (!targetShape || targetShape->size() != (size_t)valueTy.getRank())
Contributor: nit: static_cast

You might also need to check that the original shape is divisible by the target.
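A possible shape for that guard, as a sketch only (the patch as merged keeps the C-style cast and does not add the divisibility check):

```cpp
if (!targetShape ||
    targetShape->size() != static_cast<size_t>(valueTy.getRank()))
  return failure();
// Reject target tiles that do not evenly divide the original shape.
for (auto [dim, tile] : llvm::zip_equal(valueTy.getShape(), *targetShape))
  if (dim % tile != 0)
    return failure();
```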

return failure();

Type elemTy = valueTy.getElementType();
ArrayRef<int64_t> shape = valueTy.getShape();
auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());

VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);

SmallVector<OpFoldResult> mixedOffsets = op.getMixedOffsets();
SmallVector<SmallVector<OpFoldResult>> offsetsList;
for (SmallVector<int64_t> offsets :
StaticTileOffsetRange(shape, *targetShape)) {
auto adds = xegpu::addElementwise(
rewriter, loc, mixedOffsets,
getAsIndexOpFoldResult(op.getContext(), offsets));
offsetsList.push_back(adds);
}

SmallVector<Value> newOps;
layout = layout.dropInstData();
for (SmallVector<OpFoldResult> offsets : offsetsList) {
auto newOp = rewriter.create<xegpu::LoadMatrixOp>(
op.getLoc(), newValueTy, op.getMemDesc(), offsets, layout);
newOps.push_back(newOp);
}
Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
rewriter.replaceOp(op, castOp);
return success();
}
};

struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
using UnrollPattern<xegpu::StoreMatrixOp>::UnrollPattern;
LogicalResult matchAndRewrite(xegpu::StoreMatrixOp op,
PatternRewriter &rewriter) const override {
std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
if (!targetShape)
return failure();

Location loc = op.getLoc();
VectorType valueTy = op.getData().getType();
ArrayRef<int64_t> shape = valueTy.getShape();
auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());

SmallVector<Type> convertedValTypes =
getUnrolledTypes(valueTy, *targetShape);
SmallVector<Value> convertedValues =
pack(op.getData(), convertedValTypes, *targetShape, loc, rewriter);

SmallVector<OpFoldResult> mixedOffsets = op.getMixedOffsets();
SmallVector<SmallVector<OpFoldResult>> offsetsList;
for (SmallVector<int64_t> offsets :
StaticTileOffsetRange(shape, *targetShape)) {
auto adds = xegpu::addElementwise(
rewriter, loc, mixedOffsets,
getAsIndexOpFoldResult(op.getContext(), offsets));
offsetsList.push_back(adds);
}

for (auto [v, offsets] : llvm::zip_equal(convertedValues, offsetsList))
rewriter.create<xegpu::StoreMatrixOp>(loc, v, op.getMemDesc(), offsets,
layout.dropInstData());

rewriter.eraseOp(op);
return success();
}
};

} // namespace

void mlir::xegpu::populateXeGPUUnrollPatterns(
RewritePatternSet &patterns, const xegpu::UnrollOptions &options) {
patterns.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp,
UnrollCreateDescOp, UnrollLoadGatherOp, UnrollStoreScatterOp,
UnrollPrefetchOp, UnrollUpdateOffsetOp>(patterns.getContext(),
options);
patterns
.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp, UnrollCreateDescOp,
UnrollLoadGatherOp, UnrollStoreScatterOp, UnrollPrefetchOp,
UnrollUpdateOffsetOp, UnrollLoadMatrixOp, UnrollStoreMatrixOp>(
patterns.getContext(), options);
}
40 changes: 34 additions & 6 deletions mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -134,6 +134,14 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
return getDistributeLayoutAttr(loadNd.getTensorDesc());

// For LoadMatrixOp, the layout is attached as a property of the op.
if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
return loadOp.getLayoutAttr();

// For StoreMatrixOp, the layout is attached as a property of the op.
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
return storeOp.getLayoutAttr();

std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
@@ -154,6 +162,13 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();

if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
return loadOp.getLayoutAttr();

if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
return storeOp.getLayoutAttr();

std::string layoutName = xegpu::getLayoutName(opr);
if (op->hasAttr(layoutName))
return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
@@ -182,6 +197,9 @@ template void xegpu::setDistributeLayoutAttr<mlir::OpOperand>(
void xegpu::setDistributeLayoutAttrs(
Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) {
op->walk([&](Operation *nestOp) {
if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(nestOp))
return;

for (OpOperand &opr : nestOp->getOpOperands()) {
auto layout = getLayoutImpl(opr.get());
setDistributeLayoutAttr(opr, layout);
@@ -429,6 +447,21 @@ std::optional<std::string> xegpu::getChipStr(Operation *op) {
return std::nullopt;
}

/// Generates element-wise addition ops of two arrays of the same length.
SmallVector<OpFoldResult> xegpu::addElementwise(OpBuilder &builder,
Location loc,
ArrayRef<OpFoldResult> lhs,
ArrayRef<OpFoldResult> rhs) {
assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
SmallVector<OpFoldResult> results;
for (auto [l, r] : llvm::zip_equal(lhs, rhs)) {
auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
}
return results;
}
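As a usage sketch for addElementwise (operand names hypothetical): because each addition goes through createOrFold, an element whose operands are both static folds to a constant instead of materializing an index.add:

```cpp
// Hypothetical: add base offsets [%dyn, 0] to static tile offsets [8, 16].
SmallVector<OpFoldResult> base = {dynVal, builder.getIndexAttr(0)};
SmallVector<OpFoldResult> tile =
    getAsIndexOpFoldResult(builder.getContext(), {8, 16});
SmallVector<OpFoldResult> sums =
    xegpu::addElementwise(builder, loc, base, tile);
// sums[0] is a new index.add of %dyn and 8; sums[1] folds to a constant 16.
```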

/// Generates element-wise addition ops of two arrays with automatic alignment.
/// When the input arrays have different sizes, the shorter array is
/// right-aligned with the longer array, and the unmatched leading elements from
/// the longer array are preserved unchanged.
Expand All @@ -448,11 +481,6 @@ xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
a = a.slice(a.size() - b.size());
for (auto [l, r] : llvm::zip(a, b)) {
auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
}
results.append(addElementwise(builder, loc, a, b));
return results;
return {};
}
23 changes: 23 additions & 0 deletions mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -561,3 +561,26 @@ gpu.module @test_kernel {
gpu.return %e : vector<8x32x2xf16>
}
}

// -----
gpu.module @test_kernel {
//CHECK-LABEL: unroll_load_matrix
gpu.func @unroll_load_matrix(%arg0: memref<4096xi8, 3>) -> vector<32x32xf32> {
%0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32>
//CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32>
//CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32>
%1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32>
gpu.return %1: vector<32x32xf32>
}
}

// -----
gpu.module @test_kernel {
// CHECK-LABEL: unroll_store_matrix
gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) {
%mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
// CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index
xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
gpu.return
}
}
27 changes: 27 additions & 0 deletions mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -26,6 +26,33 @@ gpu.module @test_1_1_assignment {
gpu.return
}

// CHECK-LABEL: create_nd_tdesc_from_higher_rank_memref
// CHECK-SAME: [[ARG_0:%.*]]: memref<3x256x128xf32>
gpu.func @create_nd_tdesc_from_higher_rank_memref(%src: memref<3x256x128xf32>) {
//CHECK: [[SGID:%.+]] = gpu.subgroup_id : index
//CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]]
//CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]]
//CHECK: [[C32:%.+]] = arith.constant 32 : index
//CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]]
//CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
//CHECK: [[C0:%.+]] = arith.constant 0 : index
//CHECK: [[C0_2:%.+]] = arith.constant 0 : index
//CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
//CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_2]] : index
//CHECK: [[C256:%.+]] = arith.constant 256 : index
//CHECK: [[MODY:%.+]] = index.remu [[UY]], [[C256]]
//CHECK: [[C128:%.+]] = arith.constant 128 : index
//CHECK: [[MODX:%.+]] = index.remu [[UX]], [[C128]]
//CHECK: [[C0_3:%.+]] = arith.constant 0 : index
//CHECK: [[Y:%.+]] = index.add [[MODY]], [[C0_3]]
//CHECK: [[C0_4:%.+]] = arith.constant 0 : index
//CHECK: [[X:%.+]] = index.add [[MODX]], [[C0_4]]
//CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[Y]], [[X]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}

// CHECK-LABEL: load_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) {