@@ -31,121 +31,119 @@ using namespace mlir;
 namespace {
 bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
 
-// /// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by
-// the
-// /// yield op (and should be cleaned up later with dce). The yield op will
-// bypass
-// /// the create_nd_tdesc's arguments.
-// /// The rewrite will create a subview of the size used by a single work item
-// and
-// /// appropriate offset. The distributed create_nd_tdesc points into the
-// subview
-// /// without offset. The tensor descriptor types is distributed according to
-// /// sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// /// %r = vector.warp_execute_on_lane_0(%laneid) ->
-// /// (!xegpu.tensor_desc<4x8xf32>) {
-// /// ...
-// /// %td = xegpu.create_nd_tdesc %arg0[0, 0]
-// /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
-// /// vector.yield %td
-// /// }
-// /// ```
-// /// To
-// /// ```
-// /// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
-// /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
-// /// vector.yield %arg0, %dead
-// /// }
-// /// %view = memref.subview %r#0[0, %laneid] [4, 1] [1, 1]
-// /// : memref<4x8xf32> to memref<4x1xf32>
-// /// %td = xegpu.create_nd_tdesc %view[0, 0]: memref<4x1xf32>
-// /// -> !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later with dce). The yield op will
+/// bypass the create_nd_tdesc's arguments.
+/// The rewrite will create a subview of the size used by a single work item
+/// and an appropriate offset. The distributed create_nd_tdesc points into the
+/// subview without offset. The tensor descriptor type is distributed
+/// according to the sg_map attribute.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = vector.warp_execute_on_lane_0(%laneid) ->
+///     (!xegpu.tensor_desc<4x8xf32>) {
+///   ...
+///   %td = xegpu.create_nd_tdesc %arg0[0, 0]
+///       : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+///   vector.yield %td
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+///   ...
+///   %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+///       : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+///   vector.yield %arg0, %dead
+/// }
+/// %view = memref.subview %r#0[0, %laneid] [4, 1] [1, 1]
+///     : memref<4x8xf32> to memref<4x1xf32>
+/// %td = xegpu.create_nd_tdesc %view[0, 0]: memref<4x1xf32>
+///     -> !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
 struct WarpOpTensorDescOp final
     : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
   using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override;
 };
 
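// Illustrative sketch (not part of the diff above): the doc comment relies on
// the sg_map-driven distribution, where each dimension of the subgroup-level
// shape is divided by the corresponding wi_layout entry; that is why a 4x8
// descriptor with wi_layout = [1, 8] distributes to 4x1 per work item. The
// helper name below is hypothetical.
static SmallVector<int64_t> getPerWorkItemShape(ArrayRef<int64_t> sgShape,
                                                ArrayRef<int64_t> wiLayout) {
  SmallVector<int64_t> wiShape;
  for (auto [dim, lanes] : llvm::zip_equal(sgShape, wiLayout))
    wiShape.push_back(dim / lanes); // assumes `divisible(dim, lanes)` holds
  return wiShape;
}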
-// /// Sink a store_nd feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0`. In case arguments for the store are
-// passed
-// /// through the warp op interface they would be propagated as returned
-// values.
-// /// Both the stored vector type and tensor descriptor types are distributed
-// /// according to sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// /// vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
-// /// !xegpu.tensor_desc<4x8xf32>
-// /// vector.yield
-// /// }
-// /// ```
-// /// To
-// /// ```
-// /// %r = vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// vector.yield
-// /// }
-// /// xegpu.store_nd %arg0, %arg1: vector<4x1xf32>,
-// !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Sink a store_nd feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0`. In case arguments for the store are
+/// passed through the warp op interface they would be propagated as returned
+/// values. Both the stored vector type and the tensor descriptor type are
+/// distributed according to the sg_map attribute.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// vector.warp_execute_on_lane_0(%laneid) -> () {
+///   ...
+///   xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+///       !xegpu.tensor_desc<4x8xf32>
+///   vector.yield
+/// }
+/// ```
+/// To
+/// ```
+/// %r = vector.warp_execute_on_lane_0(%laneid) -> () {
+///   ...
+///   vector.yield
+/// }
+/// xegpu.store_nd %arg0, %arg1: vector<4x1xf32>,
+///     !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
 struct WarpOpStoreNd final
     : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
   using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override;
 };
 
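// Illustrative sketch (not part of the diff above): these structs are ordinary
// OpRewritePatterns on the warp op, so a pass would typically gather them into
// a RewritePatternSet and apply them greedily. The populate function name is
// hypothetical.
static void populateXeGPUDistributePatterns(RewritePatternSet &patterns) {
  // WarpOpLoadNd (declared below) would be registered here as well.
  patterns.add<WarpOpTensorDescOp, WarpOpStoreNd>(patterns.getContext());
}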
-// /// Clone a load_nd feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by
-// the
-// /// yield op (and should be cleaned up later with dce). The yield op will
-// bypass
-// /// the load's arguments.
-// /// Both the loaded vector type and tensor descriptor types are distributed
-// /// according to sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// /// %r = vector.warp_execute_on_lane_0(%laneid) ->
-// /// (!xegpu.tensor_desc<4x8xf32>) {
-// /// ...
-// /// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32>,
-// /// vector<4x8xf32> vector.yield %ld
-// /// }
-// /// ```
-// /// To
-// /// ```
-// /// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// %dead = xegpu.load_nd %arg0, %arg1:
-// /// !xegpu.tensor_desc<4x8xf32>, vector<4x8xf32>
-// /// vector.yield %arg0, %arg1
-// /// }
-// /// xegpu.store_nd %r#0, %r#1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Clone a load_nd feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later with dce). The yield op will
+/// bypass the load's arguments. Both the loaded vector type and tensor
+/// descriptor type are distributed according to the sg_map attribute.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = vector.warp_execute_on_lane_0(%laneid) ->
+///     (!xegpu.tensor_desc<4x8xf32>) {
+///   ...
+///   %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32>,
+///       vector<4x8xf32>
+///   vector.yield %ld
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+///   ...
+///   %dead = xegpu.load_nd %arg0, %arg1:
+///       !xegpu.tensor_desc<4x8xf32>, vector<4x8xf32>
+///   vector.yield %arg0, %arg1
+/// }
+/// xegpu.store_nd %r#0, %r#1: vector<4x1xf32>,
+///     !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
 struct WarpOpLoadNd final : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
   using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,