-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[mlir][xegpu] Add initial skeleton implementation for lowering ConvertLayoutOp #146176
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
2e0f4db
9e89e72
149aeea
aee53c4
c416cec
65b5dbd
ec4e7ad
b9c02fc
d8035af
4dd0c0c
da7f78a
0b42c3b
cad98ad
4568657
ff72cb5
cf59d7a
8e79477
727fc0b
e1b9d1e
9ec1f9e
4377d63
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -76,6 +76,25 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) { | |
| } | ||
| } | ||
|
|
||
| struct ConvertLayoutOpPattern | ||
| : public OpRewritePattern<xegpu::ConvertLayoutOp> { | ||
| using OpRewritePattern::OpRewritePattern; | ||
| LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, | ||
|
Comment on lines
+83
to
+86
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you add a comment here explaining why inst_data must be dropped?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, added. |
||
| PatternRewriter &rewriter) const override { | ||
| xegpu::LayoutAttr input_layout = op.getInputLayoutAttr(); | ||
| xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr(); | ||
| if (!input_layout.getInstData() || !target_layout.getInstData()) | ||
| return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp."); | ||
|
|
||
| input_layout = input_layout.dropInstData(); | ||
| target_layout = target_layout.dropInstData(); | ||
| auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>( | ||
| op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout); | ||
| rewriter.replaceOp(op, newOp); | ||
| return success(); | ||
| } | ||
| }; | ||
|
|
||
| //===------------------------------------------------------------------------===// | ||
| // The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops | ||
| // to partition operations that process large shapes into multiple operations on | ||
|
|
@@ -331,6 +350,7 @@ void XeGPUBlockingPass::runOnOperation() { | |
| }); | ||
|
|
||
| RewritePatternSet patterns(ctx); | ||
| patterns.add<ConvertLayoutOpPattern>(ctx); | ||
|
|
||
| vector::UnrollVectorOptions vectorOptions; | ||
| vectorOptions.setNativeShapeFn(options.nativeShape); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -106,12 +106,12 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> { | |
| using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern; | ||
|
|
||
| // Calculate offset for each subgroup | ||
| SmallVector<OpFoldResult> | ||
| static SmallVector<OpFoldResult> | ||
| calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc, | ||
| const SmallVector<OpFoldResult> &originalOffsets, | ||
| const SmallVector<Value> &localOffset, | ||
| const SmallVector<int64_t> &distUnitBaseAddr, | ||
| const SmallVector<int64_t> &distUnitShape) const { | ||
| const SmallVector<int64_t> &distUnitShape) { | ||
| assert(localOffset.size() == distUnitBaseAddr.size() && | ||
| "localOffset and distUnitBaseAddr must have the same rank"); | ||
|
|
||
|
|
@@ -392,6 +392,46 @@ struct WgToSgElementwiseOp : public ConversionPattern { | |
| } | ||
| }; | ||
|
|
||
| struct WgToSgConvertLayoutOp | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please add an IR example in the format.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, I got your point. Here we are only interested in the sgLayout and sgData fields. The rest of the fields, e.g.,
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I improved the comments with examples. |
||
| : public OpConversionPattern<xegpu::ConvertLayoutOp> { | ||
|
||
| using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern; | ||
| LogicalResult | ||
| matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor, | ||
| ConversionPatternRewriter &rewriter) const override { | ||
| xegpu::LayoutAttr input = op.getInputLayout(); | ||
| xegpu::LayoutAttr target = op.getTargetLayout(); | ||
|
|
||
| if (!input || !target || !input.isWgLayout() || !target.isWgLayout()) | ||
| return rewriter.notifyMatchFailure( | ||
| op, "Input and target layouts must have subgroup layout"); | ||
|
|
||
| DenseI32ArrayAttr inputSgLayout = input.getSgLayout(); | ||
| DenseI32ArrayAttr inputSgData = input.getSgData(); | ||
| DenseI32ArrayAttr targetSgLayout = target.getSgLayout(); | ||
| DenseI32ArrayAttr targetSgData = target.getSgData(); | ||
|
|
||
| // TODO: currently we only support for optimal case, where input and | ||
| // output has the same sg_layout and sg_data, so SLM is not involved. | ||
| if (inputSgLayout != targetSgLayout || inputSgData != targetSgData) | ||
|
||
| return failure(); | ||
|
||
|
|
||
| input = input.dropSgLayoutAndData(); | ||
| target = target.dropSgLayoutAndData(); | ||
|
|
||
| SmallVector<Value> newOps(adaptor.getSource()); | ||
|
|
||
| if (input && target) { | ||
| for (auto [i, src] : llvm::enumerate(adaptor.getSource())) { | ||
| auto newOp = rewriter.create<xegpu::ConvertLayoutOp>( | ||
|
||
| op.getLoc(), src.getType(), src, input, target); | ||
| newOps[i] = newOp; | ||
| } | ||
| } | ||
| rewriter.replaceOpWithMultiple(op, {newOps}); | ||
| return success(); | ||
| } | ||
| }; | ||
|
|
||
| // Handles UnrealizedConversionCastOp generated during | ||
| // SCFStructuralTypeConversions (step 1). This op may appear as either a | ||
| // target or source materialization for Vector values, e.g.: | ||
|
|
@@ -475,8 +515,8 @@ namespace xegpu { | |
| void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) { | ||
| patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp, | ||
| WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp, | ||
| UnrealizedConversionCastOpPattern, WgToSgElementwiseOp>( | ||
| patterns.getContext()); | ||
| UnrealizedConversionCastOpPattern, WgToSgElementwiseOp, | ||
| WgToSgConvertLayoutOp>(patterns.getContext()); | ||
| } | ||
| } // namespace xegpu | ||
| } // namespace mlir | ||
|
|
@@ -583,6 +623,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() { | |
| return isLegal(layout); | ||
| }); | ||
|
|
||
| target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a comment on the condition for legality? |
||
| [=](xegpu::ConvertLayoutOp op) -> bool { | ||
| return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout()); | ||
| }); | ||
|
|
||
| target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>( | ||
| [=](Operation *op) -> std::optional<bool> { | ||
| // Only handle elementwise mappable ops | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,14 +35,18 @@ gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) { | |
| } | ||
|
|
||
| gpu.func @convert_layout(%a: vector<32x64xf16>) { | ||
| %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, | ||
| resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16> | ||
| // CHECK: xegpu.convert_layout | ||
| // CHECK-SAME: <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16> | ||
| %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, | ||
| target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16> | ||
| gpu.return | ||
| } | ||
|
|
||
| gpu.func @convert_layout_wg(%a: vector<32x64xf16>) { | ||
| %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>, | ||
| resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16> | ||
| // CHECK: xegpu.convert_layout | ||
| // CHECK-SAME: <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16> | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we have any conversion test cases?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, we have one. |
||
| %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>, | ||
| target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16> | ||
| gpu.return | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -500,3 +500,41 @@ gpu.module @test_kernel { | |
| gpu.return | ||
| } | ||
| } | ||
|
|
||
| // ----- | ||
| #a = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]> | ||
| #b = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]> | ||
| #c = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]> | ||
|
|
||
| gpu.module @test_kernel { | ||
| //CHECK-LABEL: gpu.func @convert_layout | ||
| //CHECK-SAME: [[arg0:%.+]]: memref<16x16xf16>, [[arg1:%.+]]: memref<16x16xf16>, [[arg2:%.+]]: memref<16x16xf32> | ||
| //CHECK: [[c8:%.+]] = arith.constant 8 : index | ||
| //CHECK: [[c0:%.+]] = arith.constant 0 : index | ||
| //CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> | ||
| //CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> | ||
| //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16> | ||
| //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16> | ||
| //CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16> | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: I think this ConvertOp should also be folded away when we improve the logic, right? So maybe it would be better to use a different source/target layout for it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. They are actually different: the input is [16, 1], and the output is [8, 1] for lane_data. |
||
| //CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> | ||
| //CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> | ||
| //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> | ||
| //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> | ||
| //CHECK: [[c_tdesc_0:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c0]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>> | ||
| //CHECK: [[c_tdesc_1:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c8]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>> | ||
| //CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc_0]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>> | ||
| //CHECK: xegpu.store_nd [[dpas1]], [[c_tdesc_1]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>> | ||
|
|
||
| gpu.func @convert_layout(%A: memref<16x16xf16>, %B: memref<16x16xf16>, %C: memref<16x16xf32>) { | ||
| %c0 = arith.constant 0 : index | ||
| %a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b> | ||
| %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b> | ||
| %a = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> | ||
| %b = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> | ||
| %e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16> | ||
| %c = xegpu.dpas %e, %b {layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> | ||
| %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c> | ||
| xegpu.store_nd %c, %c_tdesc: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c> | ||
| gpu.return | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please consider adding or improving a test case for 2D+ shapes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add one with a 3D shape.