Commit b842f33

fix
1 parent 07f9f9f commit b842f33

File tree: 1 file changed (+99, -101)

mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp

Lines changed: 99 additions & 101 deletions
@@ -31,121 +31,119 @@ using namespace mlir;
 namespace {
 bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
 
-// /// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by
-// the
-// /// yield op (and should be cleaned up later with dce). The yield op will
-// bypass
-// /// the create_nd_tdesc's arguments.
-// /// The rewrite will create a subview of the size used by a single work item
-// and
-// /// appropriate offset. The distributed create_nd_tdesc points into the
-// subview
-// /// without offset. The tensor descriptor types is distributed according to
-// /// sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// /// %r = vector.warp_execute_on_lane_0(%laneid) ->
-// /// (!xegpu.tensor_desc<4x8xf32>) {
-// /// ...
-// /// %td = xegpu.create_nd_tdesc %arg0[0, 0]
-// /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
-// /// vector.yield %td
-// /// }
-// /// ```
-// /// To
-// /// ```
-// /// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
-// /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
-// /// vector.yield %arg0, %dead
-// /// }
-// /// %view = memref.subview %r#0[0, %laneid] [4, 1] [1, 1]
-// /// : memref<4x8xf32> to memref<4x1xf32>
-// /// %td = xegpu.create_nd_tdesc %view[0, 0]: memref<4x1xf32>
-// /// -> !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the
+/// yield op (and should be cleaned up later with dce). The yield op will
+/// bypass
+/// the create_nd_tdesc's arguments.
+/// The rewrite will create a subview of the size used by a single work item
+/// and
+/// appropriate offset. The distributed create_nd_tdesc points into the
+/// subview
+/// without offset. The tensor descriptor types is distributed according to
+/// sg_map attribute.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = vector.warp_execute_on_lane_0(%laneid) ->
+/// (!xegpu.tensor_desc<4x8xf32>) {
+/// ...
+/// %td = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+/// vector.yield %td
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+/// vector.yield %arg0, %dead
+/// }
+/// %view = memref.subview %r#0[0, %laneid] [4, 1] [1, 1]
+/// : memref<4x8xf32> to memref<4x1xf32>
+/// %td = xegpu.create_nd_tdesc %view[0, 0]: memref<4x1xf32>
+/// -> !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
 struct WarpOpTensorDescOp final
     : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
   using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override;
 };
 
-// /// Sink a store_nd feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0`. In case arguments for the store are
-// passed
-// /// through the warp op interface they would be propagated as returned
-// values.
-// /// Both the stored vector type and tensor descriptor types are distributed
-// /// according to sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// /// vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
-// /// !xegpu.tensor_desc<4x8xf32>
-// /// vector.yield
-// /// }
-// /// ```
-// /// To
-// /// ```
-// /// %r = vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// vector.yield
-// /// }
-// /// xegpu.store_nd %arg0, %arg1: vector<4x1xf32>,
-// !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Sink a store_nd feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0`. In case arguments for the store are
+/// passed
+/// through the warp op interface they would be propagated as returned
+/// values.
+/// Both the stored vector type and tensor descriptor types are distributed
+/// according to sg_map attribute.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// vector.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+/// !xegpu.tensor_desc<4x8xf32>
+/// vector.yield
+/// }
+/// ```
+/// To
+/// ```
+/// %r = vector.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// vector.yield
+/// }
+/// xegpu.store_nd %arg0, %arg1: vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
 struct WarpOpStoreNd final
     : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
   using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override;
 };
 
-// /// Clone a load_nd feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by
-// the
-// /// yield op (and should be cleaned up later with dce). The yield op will
-// bypass
-// /// the load's arguments.
-// /// Both the loaded vector type and tensor descriptor types are distributed
-// /// according to sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// /// %r = vector.warp_execute_on_lane_0(%laneid) ->
-// /// (!xegpu.tensor_desc<4x8xf32>) {
-// /// ...
-// /// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32>,
-// /// vector<4x8xf32> vector.yield %ld
-// /// }
-// /// ```
-// /// To
-// /// ```
-// /// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// %dead = xegpu.load_nd %arg0, %arg1:
-// /// !xegpu.tensor_desc<4x8xf32>, vector<4x8xf32>
-// /// vector.yield %arg0, %arg1
-// /// }
-// /// xegpu.store_nd %r#0, %r#1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Clone a load_nd feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later with dce). The yield op will
+/// bypass the load's arguments. Both the loaded vector type and tensor
+/// descriptor types are distributed according to sg_map attribute.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = vector.warp_execute_on_lane_0(%laneid) ->
+/// (!xegpu.tensor_desc<4x8xf32>) {
+/// ...
+/// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32>,
+/// vector<4x8xf32> vector.yield %ld
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// %dead = xegpu.load_nd %arg0, %arg1:
+/// !xegpu.tensor_desc<4x8xf32>, vector<4x8xf32>
+/// vector.yield %arg0, %arg1
+/// }
+/// xegpu.store_nd %r#0, %r#1: vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
 struct WarpOpLoadNd final : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
   using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
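
The reflowed comments all describe the same distribution rule: each tensor dimension is split across the lanes named by wi_layout, so a 4x8 descriptor under wi_layout = [1, 8] becomes a 4x1 descriptor per work item. A minimal sketch of that shape computation, under the assumption of a hypothetical helper name distributedShape (the divisible() helper in the diff suggests the divisibility precondition):

#include <cstdint>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Hypothetical helper, not part of this patch: divide each dimension by the
// number of lanes wi_layout assigns to it, e.g. {4, 8} / {1, 8} -> {4, 1}.
llvm::SmallVector<int64_t> distributedShape(llvm::ArrayRef<int64_t> shape,
                                            llvm::ArrayRef<int64_t> wiLayout) {
  llvm::SmallVector<int64_t> result;
  for (auto [dim, lanes] : llvm::zip_equal(shape, wiLayout))
    result.push_back(dim / lanes); // assumes divisibility, cf. divisible()
  return result;
}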

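For context, rewrite patterns like these are conventionally collected into a RewritePatternSet and run by a greedy driver. A sketch of that wiring, assuming a populate-function name for illustration (only the three pattern classes come from the file above):

#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

// Assumed entry point, shown only to illustrate the usual MLIR idiom:
// register the three distribution patterns with the caller's pattern set.
void populateXeGPUDistributePatterns(mlir::RewritePatternSet &patterns) {
  patterns.add<WarpOpTensorDescOp, WarpOpStoreNd, WarpOpLoadNd>(
      patterns.getContext());
}

// A typical driver invocation on the enclosing op would then be:
//   (void)mlir::applyPatternsAndFoldGreedily(op, std::move(patterns));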