@@ -7,15 +7,29 @@ gpu.module @test_round_robin_assignment {
77 // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32>
88 // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
99 // CHECK-NOT: xegpu.create_nd_tdesc
10- %tdesc = xegpu.create_nd_tdesc %src [0 , 0 ] : memref <256 x128 xf32 >
10+ %cst0 = arith.constant 0 : index
11+ %tdesc = xegpu.create_nd_tdesc %src [%cst0 , %cst0 ] : memref <256 x128 xf32 >
12+ -> !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
13+ gpu.return
14+ }
15+
16+ // CHECK-LABEL: create_nd_tdesc_no_offset
17+ // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
18+ gpu.func @create_nd_tdesc_no_offset (%src: memref <256 x128 xf32 >) {
19+ // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][0, 0] : memref<256x128xf32>
20+ // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
21+ // CHECK-NOT: xegpu.create_nd_tdesc
22+ // The descriptor below is built without an offset list, so no index constant is defined in this test.
23+ %tdesc = xegpu.create_nd_tdesc %src: memref <256 x128 xf32 >
1124 -> !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
1225 gpu.return
1326 }
1427
1528 // CHECK-LABEL: load_nd_tdesc
1629 // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
1730 gpu.func @load_nd_tdesc (%src: memref <256 x128 xf32 >) {
18- %tdesc = xegpu.create_nd_tdesc %src [0 , 0 ] : memref <256 x128 xf32 >
31+ %cst0 = arith.constant 0 : index
32+ %tdesc = xegpu.create_nd_tdesc %src [%cst0 , %cst0 ] : memref <256 x128 xf32 >
1933 -> !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
2034 // CHECK-COUNT-4: xegpu.load_nd %{{.*}}
2135 // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -30,7 +44,8 @@ gpu.module @test_round_robin_assignment {
3044 // CHECK-LABEL: store_nd
3145 // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
3246 gpu.func @store_nd (%src: memref <256 x128 xf32 >) {
33- %tdesc = xegpu.create_nd_tdesc %src [0 , 0 ] : memref <256 x128 xf32 >
47+ %cst0 = arith.constant 0 : index
48+ %tdesc = xegpu.create_nd_tdesc %src [%cst0 , %cst0 ] : memref <256 x128 xf32 >
3449 -> !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
3550 // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}}
3651 // CHECK-SAME-COUNT-4: : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -46,7 +61,8 @@ gpu.module @test_round_robin_assignment {
4661 // CHECK-LABEL: update_nd
4762 // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
4863 gpu.func @update_nd (%src: memref <256 x128 xf32 >){
49- %tdesc = xegpu.create_nd_tdesc %src [0 , 0 ] : memref <256 x128 xf32 >
64+ %cst0 = arith.constant 0 : index
65+ %tdesc = xegpu.create_nd_tdesc %src [%cst0 , %cst0 ] : memref <256 x128 xf32 >
5066 -> !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
5167 // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16]
5268 // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>>
@@ -69,12 +85,13 @@ gpu.module @test_round_robin_assignment {
6985 // CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
7086 // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
7187 // CHECK-NOT: xegpu.dpas
72- %tdesc_a = xegpu.create_nd_tdesc %a [0 , 0 ] : memref <256 x128 xf16 >
88+ %cst0 = arith.constant 0 : index
89+ %tdesc_a = xegpu.create_nd_tdesc %a [%cst0 , %cst0 ] : memref <256 x128 xf16 >
7390 -> !xegpu.tensor_desc <256 x128 xf16 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
7491 %load_a = xegpu.load_nd %tdesc_a
7592 : !xegpu.tensor_desc <256 x128 xf16 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
7693 -> vector <256 x128 xf16 >
77- %tdesc_b = xegpu.create_nd_tdesc %b [0 , 0 ] : memref <128 x256 xf16 >
94+ %tdesc_b = xegpu.create_nd_tdesc %b [%cst0 , %cst0 ] : memref <128 x256 xf16 >
7895 -> !xegpu.tensor_desc <128 x256 xf16 , #xegpu.layout <sg_layout = [4 , 8 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [2 , 1 ]>>
7996 %load_b = xegpu.load_nd %tdesc_b
8097 : !xegpu.tensor_desc <128 x256 xf16 , #xegpu.layout <sg_layout = [4 , 8 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [2 , 1 ]>>
@@ -91,7 +108,8 @@ gpu.module @test_round_robin_assignment {
91108 // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}}
92109 // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
93110 // CHECK-NOT: xegpu.prefetch_nd
94- %tdesc = xegpu.create_nd_tdesc %src [0 , 0 ] : memref <256 x128 xf32 >
111+ %cst0 = arith.constant 0 : index
112+ %tdesc = xegpu.create_nd_tdesc %src [%cst0 , %cst0 ] : memref <256 x128 xf32 >
95113 -> !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
96114 xegpu.prefetch_nd %tdesc
97115 : !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
@@ -101,7 +119,8 @@ gpu.module @test_round_robin_assignment {
101119 // CHECK-LABEL: broadcast
102120 // CHECK-SAME: %[[ARG_0:.*]]: memref<128x1xf32>
103121 gpu.func @broadcast (%src: memref <128 x1 xf32 >) {
104- %tdesc = xegpu.create_nd_tdesc %src [0 , 0 ] : memref <128 x1 xf32 >
122+ %cst0 = arith.constant 0 : index
123+ %tdesc = xegpu.create_nd_tdesc %src [%cst0 , %cst0 ] : memref <128 x1 xf32 >
105124 -> !xegpu.tensor_desc <128 x1 xf32 , #xegpu.layout <sg_layout = [4 , 1 ], sg_data = [16 , 1 ], lane_layout = [8 , 1 ], lane_data = [1 , 1 ]>>
106125 %load = xegpu.load_nd %tdesc
107126 : !xegpu.tensor_desc <128 x1 xf32 , #xegpu.layout <sg_layout = [4 , 1 ], sg_data = [16 , 1 ], lane_layout = [8 , 1 ], lane_data = [1 , 1 ]>>
@@ -122,8 +141,8 @@ gpu.module @test_round_robin_assignment {
122141 %c0 = arith.constant 0 : index
123142 %c256 = arith.constant 256 : index
124143 %c1024 = arith.constant 1024 : index
125- %0 = xegpu.create_nd_tdesc %arg0 [0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
126- %1 = xegpu.create_nd_tdesc %arg1 [0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
144+ %0 = xegpu.create_nd_tdesc %arg0 [%c0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
145+ %1 = xegpu.create_nd_tdesc %arg1 [%c0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
127146 // CHECK-LABEL: scf.for
128147 // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>)
129148 %2:2 = scf.for %arg2 = %c0 to %c1024 step %c256 iter_args (%arg3 = %0 , %arg4 = %1 )
@@ -143,9 +162,10 @@ gpu.module @test_round_robin_assignment {
143162 %c1_i32 = arith.constant 1 : i32
144163 %c10_i32 = arith.constant 10 : i32
145164 %c0_i32 = arith.constant 0 : i32
146- %0 = xegpu.create_nd_tdesc %arg0 [0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
165+ %cst0 = arith.constant 0 : index
166+ %0 = xegpu.create_nd_tdesc %arg0 [%cst0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
147167 %1 = xegpu.load_nd %0 : !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>> -> vector <256 xf32 >
148- %2 = xegpu.create_nd_tdesc %arg1 [0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
168+ %2 = xegpu.create_nd_tdesc %arg1 [%cst0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
149169 //CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32)
150170 %3:2 = scf.while (%arg2 = %1 , %arg3 = %c0_i32 ) : (vector <256 xf32 >, i32 ) -> (vector <256 xf32 >, i32 ) {
151171 %4 = arith.cmpi slt , %arg3 , %c10_i32 : i32
@@ -164,10 +184,11 @@ gpu.module @test_round_robin_assignment {
164184 }
165185
166186 gpu.func @scf_if (%arg0: memref <1024 xf32 >, %arg1: memref <1024 xf32 >) {
187+ %cst0 = arith.constant 0 : index
167188 %c10 = arith.constant 10 : index
168189 %0 = gpu.subgroup_id : index
169- %1 = xegpu.create_nd_tdesc %arg0 [0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
170- %2 = xegpu.create_nd_tdesc %arg1 [0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
190+ %1 = xegpu.create_nd_tdesc %arg0 [%cst0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
191+ %2 = xegpu.create_nd_tdesc %arg1 [%cst0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
171192 %3 = arith.cmpi eq , %0 , %c10 : index
172193 // CHECK-LABEL: scf.if
173194 // CHECK-SAME: (vector<16xf32>, vector<16xf32>)
@@ -189,20 +210,20 @@ gpu.module @test_round_robin_assignment {
189210 gpu.func @scf_if_tensor_desc (%arg0: memref <1024 xf32 >, %arg1: memref <1024 xf32 >) {
190211 %c10 = arith.constant 10 : index
191212 %id = gpu.subgroup_id : index
192-
193- %t = xegpu.create_nd_tdesc %arg0 [0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
213+ %cst0 = arith.constant 0 : index
214+ %t = xegpu.create_nd_tdesc %arg0 [%cst0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
194215 %d = xegpu.load_nd %t : !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>> -> vector <256 xf32 >
195216
196217 %0 = arith.cmpi eq , %id , %c10 : index
197218 // CHECK-LABEL: scf.if
198219 // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>)
199220 %1 = scf.if %0 -> (!xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>) {
200- %2 = xegpu.create_nd_tdesc %arg0 [0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
221+ %2 = xegpu.create_nd_tdesc %arg0 [%cst0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
201222 // CHECK-LABEL: scf.yield
202223 // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
203224 scf.yield %2 : !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
204225 } else {
205- %3 = xegpu.create_nd_tdesc %arg1 [0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
226+ %3 = xegpu.create_nd_tdesc %arg1 [%cst0 ] : memref <1024 xf32 > -> !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
206227 // CHECK-LABEL: scf.yield
207228 // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
208229 scf.yield %3 : !xegpu.tensor_desc <256 xf32 , #xegpu.layout <sg_layout = [8 ], sg_data = [16 ]>>
@@ -212,7 +233,8 @@ gpu.module @test_round_robin_assignment {
212233 }
213234
214235 gpu.func @convert_layout_optimal (%arg0: memref <32 x64 xf32 >) {
215- %0 = xegpu.create_nd_tdesc %arg0 [0 , 0 ] : memref <32 x64 xf32 > -> !xegpu.tensor_desc <32 x64 xf32 , #xegpu.layout <sg_layout = [2 , 2 ], sg_data = [16 , 16 ], inst_data = [16 , 16 ]>>
236+ %cst0 = arith.constant 0 : index
237+ %0 = xegpu.create_nd_tdesc %arg0 [%cst0 , %cst0 ] : memref <32 x64 xf32 > -> !xegpu.tensor_desc <32 x64 xf32 , #xegpu.layout <sg_layout = [2 , 2 ], sg_data = [16 , 16 ], inst_data = [16 , 16 ]>>
216238 //CHECK-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32>
217239 //CHECK-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>
218240 %1 = xegpu.load_nd %0 : !xegpu.tensor_desc <32 x64 xf32 , #xegpu.layout <sg_layout = [2 , 2 ], sg_data = [16 , 16 ], inst_data = [16 , 16 ]>> -> vector <32 x64 xf32 >
0 commit comments