Merged. Changes from 18 commits.
21 changes: 11 additions & 10 deletions mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -921,21 +921,22 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
let summary = "Convert the layout of the input operand";
let description = [{
`convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying
the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such
as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once the IR is
lowered to WI level because that is the end result of all distributions.
    `convert_layout` redistributes data across subgroups and/or work-items from the `input_layout` to
    the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming
    scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once
    the IR is lowered to the WI level, because that is the end result of all distributions.
}];
let arguments = (ins XeGPU_Vector2DType: $source,
XeGPU_LayoutAttr: $srcMap,
XeGPU_LayoutAttr: $resMap
);
let results = (outs XeGPU_Vector2DType: $result);
let arguments = (ins XeGPU_VectorType: $source,
XeGPU_LayoutAttr: $input_layout,
[Review thread]
Contributor: Please consider adding or improving a test case for 2d+ shapes.
Author: Added one with a 3D shape.

XeGPU_LayoutAttr: $target_layout);
let results = (outs XeGPU_VectorType: $result);
let assemblyFormat = [{
$source attr-dict `:` type($source)
$source prop-dict attr-dict `:` type($source)
}];

let hasFolder = 1;
let hasVerifier = 1;
let hasCanonicalizer = 1;
}

#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
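
For reference, the updated assembly (now using prop-dict) reads as in the revised layout.mlir test later in this diff; a minimal sketch:

    %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
                                   target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>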
2 changes: 1 addition & 1 deletion mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -22,7 +22,7 @@ def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
def XeGPU_Vector2DType: FixedVectorOfRankAndType<[2], [XeGPU_ScalarType]>;
def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
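
With XeGPU_VectorType accepting ranks 1 through 6 instead of the old rank-2-only XeGPU_Vector2DType, higher-rank conversions become expressible. A hypothetical 3D sketch (the shapes and layouts here are illustrative, not the test actually added in the PR):

    %r = xegpu.convert_layout %v <{input_layout = #xegpu.layout<inst_data = [2, 8, 16]>,
                                   target_layout = #xegpu.layout<inst_data = [1, 16, 16]>}> : vector<4x16x16xf16>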

// common base class for types in XeGPU dialect
class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
61 changes: 42 additions & 19 deletions mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -607,32 +607,55 @@ LogicalResult DpasOp::verify() {
// XeGPU_ConvertLayoutOp
//===----------------------------------------------------------------------===//
LogicalResult ConvertLayoutOp::verify() {
auto srcMap = getSrcMapAttr();
auto resMap = getResMapAttr();
if (!srcMap)
return emitOpError("expected srcMap.");
if (!resMap)
return emitOpError("expected resMap.");

if (srcMap == resMap)
return emitOpError("expected different srcMap and resMap.");

// both srcMap and resMap should be WgLayout or SgLayout at the same time.
if ((!srcMap.isWgLayout() || !resMap.isWgLayout()) &&
(!srcMap.isSgLayout() || !resMap.isSgLayout()))
return emitOpError(
"expected srcMap and resMap be WgLayout or SgLayout at the same time.");
auto srcLayout = getInputLayout();
auto resLayout = getTargetLayout();
if (!srcLayout)
return emitOpError("expected input layout.");
if (!resLayout)
return emitOpError("expected target layout.");

// both input and target layouts should be WgLayout or SgLayout at the same
// time.
if ((!srcLayout.isWgLayout() || !resLayout.isWgLayout()) &&
(!srcLayout.isSgLayout() || !resLayout.isSgLayout()))
return emitOpError("expected input layout and target layout be WgLayout or "
"SgLayout at the same time.");

auto shape = getSource().getType().getShape();
if (!XeGPUDialect::isEvenlyDistributable(shape, srcMap))
return emitOpError("invalid srcMap, data cannot be evenly distributed.");
if (!XeGPUDialect::isEvenlyDistributable(shape, srcLayout))
return emitOpError(
"invalid input layout, data cannot be evenly distributed.");

if (!XeGPUDialect::isEvenlyDistributable(shape, resMap))
return emitOpError("invalid resMap, data cannot be evenly distributed.");
if (!XeGPUDialect::isEvenlyDistributable(shape, resLayout))
return emitOpError(
"invalid target layout, data cannot be evenly distributed.");

return mlir::success();
}

OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) {
if (getInputLayout() == getTargetLayout())
return getSource();
return {};
}
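
A minimal sketch of what this folder eliminates (layout values illustrative): when the input and target layouts are identical, the op folds to its source.

    // %r folds to %v because input_layout == target_layout.
    %r = xegpu.convert_layout %v <{input_layout = #xegpu.layout<inst_data = [8, 16]>,
                                   target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<32x64xf16>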

struct FoldConvertLayoutOp : public OpRewritePattern<xegpu::ConvertLayoutOp> {
using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
PatternRewriter &rewriter) const override {
if (op.getInputLayout() == op.getTargetLayout()) {
rewriter.replaceOp(op, op.getSource());
return success();
}
return failure();
}
};

void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
MLIRContext *context) {
patterns.add<FoldConvertLayoutOp>(context);
}

} // namespace xegpu
} // namespace mlir

24 changes: 24 additions & 0 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -76,6 +76,29 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
}
}

// This pattern lowers ConvertLayoutOp by removing the inst_data field from the
// layout attributes. Because producer and consumer operations partition data
// based on their own inst_data while preserving the original input and output
// shapes, ConvertLayoutOp itself does not need to manage inst_data.
struct ConvertLayoutOpPattern
: public OpRewritePattern<xegpu::ConvertLayoutOp> {
using OpRewritePattern::OpRewritePattern;
LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
[Review thread on lines +83 to +86]
Contributor: Could you add a comment here explaining why inst_data must be dropped?
Author: Yes, added.

PatternRewriter &rewriter) const override {
xegpu::LayoutAttr input_layout = op.getInputLayoutAttr();
xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr();
if (!input_layout.getInstData() || !target_layout.getInstData())
return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp.");

input_layout = input_layout.dropInstData();
target_layout = target_layout.dropInstData();
auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(
op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout);
rewriter.replaceOp(op, newOp);
return success();
}
};
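
A sketch of this rewrite (attribute values illustrative): inst_data is dropped from both layouts while any other fields survive.

    // Before blocking:
    %r = xegpu.convert_layout %v <{input_layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
                                   target_layout = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
    // After the pattern, only the lane-level fields remain; if the stripped
    // layouts end up equal, createOrFold collapses the op to %v.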

//===------------------------------------------------------------------------===//
// The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops
// to partition operations that process large shapes into multiple operations on
@@ -331,6 +354,7 @@ void XeGPUBlockingPass::runOnOperation() {
});

RewritePatternSet patterns(ctx);
patterns.add<ConvertLayoutOpPattern>(ctx);

vector::UnrollVectorOptions vectorOptions;
vectorOptions.setNativeShapeFn(options.nativeShape);
82 changes: 78 additions & 4 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -106,12 +106,12 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;

// Calculate offset for each subgroup
SmallVector<OpFoldResult>
static SmallVector<OpFoldResult>
calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
const SmallVector<OpFoldResult> &originalOffsets,
const SmallVector<Value> &localOffset,
const SmallVector<int64_t> &distUnitBaseAddr,
const SmallVector<int64_t> &distUnitShape) const {
const SmallVector<int64_t> &distUnitShape) {
assert(localOffset.size() == distUnitBaseAddr.size() &&
"localOffset and distUnitBaseAddr must have the same rank");

@@ -392,6 +392,75 @@ struct WgToSgElementwiseOp : public ConversionPattern {
}
};

// clang-format off
// Pattern for lowering ConvertLayoutOp based on sg_layout and sg_data.
// If input_layout and target_layout have identical sg_layout and sg_data,
// the op is rewritten to a subgroup-level ConvertLayoutOp with these fields
// dropped. For example:
// #a = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>
// #b = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>
// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<32x64xf32>
// becomes:
// #a = #xegpu.layout<inst_data = [16, 16]>
// #b = #xegpu.layout<inst_data = [8, 16]>
// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<16x16xf32>
// (vector<16x16xf32> is determined by sg_data = [16, 16])
//
// If sg_layout or sg_data differ, SLM (shared local memory) is used to redistribute data across subgroups.
// For example:
// #a = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 16], inst_data = [16, 16]>
// #b = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 32], inst_data = [8, 16]>
// xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<32x64xf32>
// is lowered to:
// #a = #xegpu.layout<inst_data = [16, 16]>
// #b = #xegpu.layout<inst_data = [8, 16]>
//   store_matrix %1, %slm <{layout_input_0 = #a}> : vector<32x16xf32>, matrix_desc<32x64xf32>
//   %d = load_matrix %slm <{layout_result_0 = #a}> : matrix_desc<32x64xf32> -> vector<16x32xf32>
// xegpu.convert_layout %d <{input_layout = #a, target_layout = #b}> : vector<16x32xf32>
// clang-format on
struct WgToSgConvertLayoutOp
[Review thread]
Contributor: Please add an IR example in the comments, in From/To format.
Author: Here we are only interested in the sgLayout and sgData fields; the rest of the fields, e.g., inst_data, could differ.
Author: I improved the comments with examples.

: public OpConversionPattern<xegpu::ConvertLayoutOp> {
[Review thread on lines +495 to +496]
Contributor: Add a comment about the pattern.

using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
LogicalResult
matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
xegpu::LayoutAttr input = op.getInputLayout();
xegpu::LayoutAttr target = op.getTargetLayout();

if (!input || !target || !input.isWgLayout() || !target.isWgLayout())
return rewriter.notifyMatchFailure(
op, "Input and target layouts must have subgroup layout");

DenseI32ArrayAttr inputSgLayout = input.getSgLayout();
DenseI32ArrayAttr inputSgData = input.getSgData();
DenseI32ArrayAttr inputOrder = input.getOrder();
DenseI32ArrayAttr targetSgLayout = target.getSgLayout();
DenseI32ArrayAttr targetSgData = target.getSgData();
DenseI32ArrayAttr targetOrder = target.getOrder();

    // TODO: currently we only support the optimal case, where input and
    // output have the same sg_layout and sg_data, so SLM is not involved.
if (inputSgLayout != targetSgLayout || inputSgData != targetSgData ||
inputOrder != targetOrder)
return failure();

input = input.dropSgLayoutAndData();
target = target.dropSgLayoutAndData();

SmallVector<Value> newOps(adaptor.getSource());
if (input && target) {
      // Keep the ConvertLayoutOp for the remaining fields, e.g., inst_data.
for (auto [i, src] : llvm::enumerate(adaptor.getSource())) {
auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
[Review thread]
Contributor: These will be folded away, if I am not mistaken? In that case, why not directly forward the sources without creating new ConvertOps?
Author: No, they are not folded. ConvertLayoutOp is folded only when the input and target layouts are the same; here input and target could still differ.
Contributor: A bit confused here. Line 415 says otherwise: if they are different we bail out early. Please add some comments to clarify the logic here.

op.getLoc(), src.getType(), src, input, target);
newOps[i] = newOp;
}
}
rewriter.replaceOpWithMultiple(op, {newOps});
return success();
}
};

// Handles UnrealizedConversionCastOp generated during
// SCFStructuralTypeConversions (step 1). This op may appear as either a
// target or source materialization for Vector values, e.g.:
@@ -475,8 +544,8 @@ namespace xegpu {
void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp,
WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp,
UnrealizedConversionCastOpPattern, WgToSgElementwiseOp>(
patterns.getContext());
UnrealizedConversionCastOpPattern, WgToSgElementwiseOp,
WgToSgConvertLayoutOp>(patterns.getContext());
}
} // namespace xegpu
} // namespace mlir
@@ -583,6 +652,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
return isLegal(layout);
});

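// A ConvertLayoutOp is legal once both its input and target layouts satisfy
// the same isLegal predicate used for the other ops above (assumption: this
// means neither layout still carries workgroup-level distribution info).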
target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
[Review thread]
Contributor: Can you add a comment on the condition for legality?

[=](xegpu::ConvertLayoutOp op) -> bool {
return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
});

target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
[=](Operation *op) -> std::optional<bool> {
// Only handle elementwise mappable ops
7 changes: 6 additions & 1 deletion mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -124,6 +124,10 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");

// For ConvertLayoutOp, the layout is stored in the targetLayoutAttr
if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
return convertOp.getTargetLayoutAttr();

// for LoadNdOp, the layout is stored in the tensor descriptor
if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
return getLayoutAttr(loadNd.getTensorDesc());
@@ -137,7 +141,8 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
auto parentOp = arg.getOwner()->getParentOp();
if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
OpOperand *tiedInit = loop.getTiedLoopInit(arg);
return getLayoutAttr(tiedInit->get());
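      // Assumption: getTiedLoopInit returns null for block arguments that are
      // not loop-carried (e.g. induction variables), hence the guard below.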
if (tiedInit)
return getLayoutAttr(tiedInit->get());
}
}

14 changes: 3 additions & 11 deletions mlir/test/Dialect/XeGPU/invalid.mlir
@@ -508,19 +508,11 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
return
}

// -----
func.func @convert_layout_same_map(%a: vector<32x64xf16>) {
// expected-error@+1 {{expected different srcMap and resMap}}
%2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
gpu.return
}

// -----
func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
// expected-error@+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}}
%2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
// expected-error@+1 {{expected input layout and target layout to be WgLayout or SgLayout at the same time}}
%2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}

12 changes: 8 additions & 4 deletions mlir/test/Dialect/XeGPU/layout.mlir
@@ -35,14 +35,18 @@ gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
}

gpu.func @convert_layout(%a: vector<32x64xf16>) {
%2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
// CHECK: xegpu.convert_layout
// CHECK-SAME: <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
%2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}

gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
%2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
// CHECK: xegpu.convert_layout
// CHECK-SAME: <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
[Review thread]
Contributor: Do we have any conversion test cases?
Author: Yes, we have one.

%2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}
