[mlir][xegpu] Add initial skeleton implementation for lowering ConvertLayoutOp #146176
Changes from 18 commits
| Original file line number | Diff line number | Diff line change |
|---|---|---|
@@ -76,6 +76,29 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) { | |
| } | ||
| } | ||
| // This pattern lowers ConvertLayoutOp by removing the inst_data field from the | ||
| // layout attributes. Since both producer and consumer operations handle data | ||
| // partitioning based on their own inst_data while preserving the original | ||
| // input and output shapes, ConvertLayoutOp does not need to manage inst_data. | ||
| struct ConvertLayoutOpPattern | ||
| : public OpRewritePattern<xegpu::ConvertLayoutOp> { | ||
| using OpRewritePattern::OpRewritePattern; | ||
| LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, | ||
Comment on lines +83 to +86
Contributor: Could you add a comment here explaining why inst_data must be dropped?
Author: Yes, added.
| PatternRewriter &rewriter) const override { | ||
| xegpu::LayoutAttr input_layout = op.getInputLayoutAttr(); | ||
| xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr(); | ||
| if (!input_layout.getInstData() || !target_layout.getInstData()) | ||
| return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp."); | ||
| input_layout = input_layout.dropInstData(); | ||
| target_layout = target_layout.dropInstData(); | ||
| auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>( | ||
| op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout); | ||
| rewriter.replaceOp(op, newOp); | ||
| return success(); | ||
| } | ||
| }; | ||
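For illustration, a hypothetical before/after of this pattern (made-up layouts, shapes, and value names, not taken from the patch's tests):

// Before blocking: both layouts carry inst_data alongside the lane-level fields.
%r = xegpu.convert_layout %v <{input_layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
                               target_layout = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
// After ConvertLayoutOpPattern: inst_data is dropped from both layouts; the remaining fields and the result type are unchanged.
%r = xegpu.convert_layout %v <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
                               target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>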
| //===------------------------------------------------------------------------===// | ||
| // The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops | ||
| // to partition operations that process large shapes into multiple operations on | ||
@@ -331,6 +354,7 @@ void XeGPUBlockingPass::runOnOperation() { | |
| }); | ||
| RewritePatternSet patterns(ctx); | ||
| patterns.add<ConvertLayoutOpPattern>(ctx); | ||
| vector::UnrollVectorOptions vectorOptions; | ||
| vectorOptions.setNativeShapeFn(options.nativeShape); | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
@@ -106,12 +106,12 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> { | |
| using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern; | ||
| // Calculate offset for each subgroup | ||
| SmallVector<OpFoldResult> | ||
| static SmallVector<OpFoldResult> | ||
| calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc, | ||
| const SmallVector<OpFoldResult> &originalOffsets, | ||
| const SmallVector<Value> &localOffset, | ||
| const SmallVector<int64_t> &distUnitBaseAddr, | ||
| const SmallVector<int64_t> &distUnitShape) const { | ||
| const SmallVector<int64_t> &distUnitShape) { | ||
| assert(localOffset.size() == distUnitBaseAddr.size() && | ||
| "localOffset and distUnitBaseAddr must have the same rank"); | ||
@@ -392,6 +392,75 @@ struct WgToSgElementwiseOp : public ConversionPattern { | |
| } | ||
| }; | ||
| // clang-format off | ||
| // Pattern for lowering ConvertLayoutOp based on sg_layout and sg_data. | ||
| // If input_layout and target_layout have identical sg_layout and sg_data, | ||
| // the op is rewritten to a subgroup-level ConvertLayoutOp with these fields | ||
| // dropped. For example: | ||
| // #a = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]> | ||
| // #b = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]> | ||
| // xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<32x64xf32> | ||
| // becomes: | ||
| // #a = #xegpu.layout<inst_data = [16, 16]> | ||
| // #b = #xegpu.layout<inst_data = [8, 16]> | ||
| // xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<16x16xf32> | ||
| // (vector<16x16xf32> is determined by sg_data = [16, 16]) | ||
| // | ||
| // If sg_layout or sg_data differ, SLM is used to redistribute data across subgroups. | ||
| // For example: | ||
| // #a = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 16], inst_data = [16, 16]> | ||
| // #b = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 32], inst_data = [8, 16]> | ||
| // xegpu.convert_layout %1 <{input_layout = #a, target_layout = #b}> : vector<32x64xf32> | ||
| // is lowered to: | ||
| // #a = #xegpu.layout<inst_data = [16, 16]> | ||
| // #b = #xegpu.layout<inst_data = [8, 16]> | ||
| // store_matrix %1, %slm <{layout_input_0 = #a}> : vector<32x16xf32>, matrix_desc<32x64xf32> | ||
| // %d = load_matrix %slm <{layout_result_0 = #a}> : matrix_desc<32x64xf32> -> vector<16x32xf32> | ||
chencha3 marked this conversation as resolved.
| // xegpu.convert_layout %d <{input_layout = #a, target_layout = #b}> : vector<16x32xf32> | ||
| // clang-format on | ||
| struct WgToSgConvertLayoutOp | ||
Contributor: Please add an IR example in the format …
Author: Oh, I got your point. Here we are only interested in the sgLayout and sgData fields. The rest of the fields, e.g., …
Author: I improved the comments with examples.
| : public OpConversionPattern<xegpu::ConvertLayoutOp> { | ||
| using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern; | ||
| LogicalResult | ||
| matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor, | ||
| ConversionPatternRewriter &rewriter) const override { | ||
| xegpu::LayoutAttr input = op.getInputLayout(); | ||
| xegpu::LayoutAttr target = op.getTargetLayout(); | ||
| if (!input || !target || !input.isWgLayout() || !target.isWgLayout()) | ||
| return rewriter.notifyMatchFailure( | ||
| op, "Input and target layouts must have subgroup layout"); | ||
| DenseI32ArrayAttr inputSgLayout = input.getSgLayout(); | ||
| DenseI32ArrayAttr inputSgData = input.getSgData(); | ||
| DenseI32ArrayAttr inputOrder = input.getOrder(); | ||
| DenseI32ArrayAttr targetSgLayout = target.getSgLayout(); | ||
| DenseI32ArrayAttr targetSgData = target.getSgData(); | ||
| DenseI32ArrayAttr targetOrder = target.getOrder(); | ||
| // TODO: currently we only support the optimal case, where input and | ||
| // output have the same sg_layout and sg_data, so SLM is not involved. | ||
| if (inputSgLayout != targetSgLayout || inputSgData != targetSgData || | ||
| inputOrder != targetOrder) | ||
| return failure(); | ||
| input = input.dropSgLayoutAndData(); | ||
| target = target.dropSgLayoutAndData(); | ||
| SmallVector<Value> newOps(adaptor.getSource()); | ||
| if (input && target) { | ||
| // Keep the ConvertLayoutOp for the remaining fields, e.g., inst_data. | ||
| for (auto [i, src] : llvm::enumerate(adaptor.getSource())) { | ||
| auto newOp = rewriter.create<xegpu::ConvertLayoutOp>( | ||
| op.getLoc(), src.getType(), src, input, target); | ||
| newOps[i] = newOp; | ||
| } | ||
| } | ||
| rewriter.replaceOpWithMultiple(op, {newOps}); | ||
| return success(); | ||
| } | ||
| }; | ||
| // Handles UnrealizedConversionCastOp generated during | ||
| // SCFStructuralTypeConversions (step 1). This op may appear as either a | ||
| // target or source materialization for Vector values, e.g.: | ||
@@ -475,8 +544,8 @@ namespace xegpu { | |
| void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) { | ||
| patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp, | ||
| WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp, | ||
| UnrealizedConversionCastOpPattern, WgToSgElementwiseOp>( | ||
| patterns.getContext()); | ||
| UnrealizedConversionCastOpPattern, WgToSgElementwiseOp, | ||
| WgToSgConvertLayoutOp>(patterns.getContext()); | ||
| } | ||
| } // namespace xegpu | ||
| } // namespace mlir | ||
@@ -583,6 +652,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() { | |
| return isLegal(layout); | ||
| }); | ||
| target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>( | ||
Contributor: Can you add a comment on the condition for legality?
| [=](xegpu::ConvertLayoutOp op) -> bool { | ||
| return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout()); | ||
| }); | ||
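Roughly speaking, and assuming isLegal here means the layout no longer carries sg_layout/sg_data (mirroring the checks used for the other XeGPU ops in this pass), a hypothetical sketch of what stays illegal versus legal (made-up values and shapes):

// Illegal: workgroup-level layouts still carry sg_layout/sg_data and must be distributed first.
%c = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>,
                               target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x32xf32>
// Legal: subgroup-level layouts that only carry the remaining fields, e.g. inst_data.
%d = xegpu.convert_layout %b <{input_layout = #xegpu.layout<inst_data = [16, 16]>,
                               target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>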
| target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>( | ||
| [=](Operation *op) -> std::optional<bool> { | ||
| // Only handle elementwise mappable ops | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
@@ -35,14 +35,18 @@ gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) { | |
| } | ||
| gpu.func @convert_layout(%a: vector<32x64xf16>) { | ||
| %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, | ||
| resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16> | ||
| // CHECK: xegpu.convert_layout | ||
| // CHECK-SAME: <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16> | ||
| %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, | ||
| target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16> | ||
| gpu.return | ||
| } | ||
| gpu.func @convert_layout_wg(%a: vector<32x64xf16>) { | ||
| %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>, | ||
| resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16> | ||
| // CHECK: xegpu.convert_layout | ||
| // CHECK-SAME: <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16> | ||
Contributor: Do we have any conversion test cases?
Author: Yes, we have one.
| %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>, | ||
| target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16> | ||
| gpu.return | ||
| } | ||
Please consider adding or improving a test case for 2d+ shapes.
Add one with 3D shape.