@@ -11,7 +11,7 @@ gpu.module @test {
1111 %c0 = arith.constant 0 : index
1212 %cst = arith.constant {layout_result_0 = #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>} dense <1.000000e+00 > : vector <16 xf32 >
1313 %0 = xegpu.create_nd_tdesc %arg0 [%c0 ] : memref <16 xf32 > -> !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>>
14- xegpu.store_nd %cst , %0 { layout_operand_0 = #xegpu.layout < lane_layout = [ 16 ], lane_data = [ 1 ]>} : vector <16 xf32 >, !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>>
14+ xegpu.store_nd %cst , %0 : vector <16 xf32 >, !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>>
1515 gpu.return
1616 }
1717}
@@ -27,7 +27,7 @@ gpu.module @test {
2727 %c0 = arith.constant 0 : index
2828 %cst = arith.constant {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} dense <1.000000e+00 > : vector <16 x16 xf16 >
2929 %0 = xegpu.create_nd_tdesc %arg0 [%c0 , %c0 ] : memref <16 x16 xf16 > -> !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
30- xegpu.store_nd %cst , %0 { layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>} : vector <16 x16 xf16 >, !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
30+ xegpu.store_nd %cst , %0 : vector <16 x16 xf16 >, !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
3131 gpu.return
3232 }
3333}
@@ -47,7 +47,7 @@ gpu.module @test {
4747 %0 = xegpu.create_nd_tdesc %arg0 [%c0 ] : memref <16 xf32 > -> !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>>
4848 %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>} : !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>> -> vector <16 xf32 >
4949 %2 = xegpu.create_nd_tdesc %arg1 [%c0 ] : memref <16 xf32 > -> !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>>
50- xegpu.store_nd %1 , %2 { layout_operand_0 = #xegpu.layout < lane_layout = [ 16 ], lane_data = [ 1 ]>} : vector <16 xf32 >, !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>>
50+ xegpu.store_nd %1 , %2 : vector <16 xf32 >, !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>>
5151 gpu.return
5252 }
5353}
@@ -65,7 +65,7 @@ gpu.module @test {
6565 %0 = xegpu.create_nd_tdesc %arg0 [%c0 , %c0 ] : memref <16 x16 xf16 > -> !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
6666 %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>> -> vector <16 x16 xf16 >
6767 %2 = xegpu.create_nd_tdesc %arg1 [%c0 , %c0 ] : memref <16 x16 xf16 > -> !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
68- xegpu.store_nd %1 , %2 { layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>} : vector <16 x16 xf16 >, !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
68+ xegpu.store_nd %1 , %2 : vector <16 x16 xf16 >, !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
6969 gpu.return
7070 }
7171}
@@ -85,9 +85,9 @@ gpu.module @test {
8585 %c0 = arith.constant 0 : index
8686 %0 = xegpu.create_nd_tdesc %arg0 [%c0 , %c0 ] : memref <16 x16 xf16 > -> !xegpu.tensor_desc <16 x16 xf16 , #xegpu.block_tdesc_attr <array_length = 2 : i64 >, #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
8787 %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : !xegpu.tensor_desc <16 x16 xf16 , #xegpu.block_tdesc_attr <array_length = 2 : i64 >, #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>> -> vector <2 x16 x16 xf16 >
88- %2 = vector.extract %1 [%c0 ] {layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>, layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : vector <16 x16 xf16 > from vector <2 x16 x16 xf16 >
88+ %2 = vector.extract %1 [%c0 ] {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : vector <16 x16 xf16 > from vector <2 x16 x16 xf16 >
8989 %3 = xegpu.create_nd_tdesc %arg1 [%c0 , %c0 ] : memref <16 x16 xf16 > -> !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
90- xegpu.store_nd %2 , %3 { layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>} : vector <16 x16 xf16 >, !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
90+ xegpu.store_nd %2 , %3 : vector <16 x16 xf16 >, !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
9191 gpu.return
9292 }
9393}
@@ -109,9 +109,9 @@ gpu.module @test {
109109 %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : !xegpu.tensor_desc <8 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>> -> vector <8 x16 xf16 >
110110 %2 = xegpu.create_nd_tdesc %arg1 [%c0 , %c0 ] : memref <16 x16 xf16 > -> !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [2 , 1 ]>>
111111 %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [2 , 1 ]>} : !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [2 , 1 ]>> -> vector <16 x16 xf16 >
112- %4 = xegpu.dpas %1 , %3 {layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>, layout_operand_1 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 2 , 1 ]>, layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : vector <8 x16 xf16 >, vector <16 x16 xf16 > -> vector <8 x16 xf32 >
112+ %4 = xegpu.dpas %1 , %3 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : vector <8 x16 xf16 >, vector <16 x16 xf16 > -> vector <8 x16 xf32 >
113113 %5 = xegpu.create_nd_tdesc %arg2 [%c0 , %c0 ] : memref <8 x16 xf32 > -> !xegpu.tensor_desc <8 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
114- xegpu.store_nd %4 , %5 { layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>} : vector <8 x16 xf32 >, !xegpu.tensor_desc <8 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
114+ xegpu.store_nd %4 , %5 : vector <8 x16 xf32 >, !xegpu.tensor_desc <8 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
115115 gpu.return
116116 }
117117}
@@ -137,10 +137,10 @@ gpu.module @test {
137137 %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : !xegpu.tensor_desc <8 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>> -> vector <8 x16 xf16 >
138138 %2 = xegpu.create_nd_tdesc %arg1 [%c0 , %c0 ] : memref <16 x16 xf16 > -> !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [2 , 1 ]>>
139139 %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [2 , 1 ]>} : !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [2 , 1 ]>> -> vector <16 x16 xf16 >
140- %4 = xegpu.dpas %1 , %3 {layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>, layout_operand_1 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 2 , 1 ]>, layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : vector <8 x16 xf16 >, vector <16 x16 xf16 > -> vector <8 x16 xf32 >
141- %5 = math.exp %4 {layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>, layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : vector <8 x16 xf32 >
140+ %4 = xegpu.dpas %1 , %3 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : vector <8 x16 xf16 >, vector <16 x16 xf16 > -> vector <8 x16 xf32 >
141+ %5 = math.exp %4 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : vector <8 x16 xf32 >
142142 %6 = xegpu.create_nd_tdesc %arg2 [%c0 , %c0 ] : memref <8 x16 xf32 > -> !xegpu.tensor_desc <8 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
143- xegpu.store_nd %5 , %6 { layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>} : vector <8 x16 xf32 >, !xegpu.tensor_desc <8 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
143+ xegpu.store_nd %5 , %6 : vector <8 x16 xf32 >, !xegpu.tensor_desc <8 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
144144 gpu.return
145145 }
146146}
@@ -160,7 +160,7 @@ gpu.module @test {
160160 %0 = xegpu.create_nd_tdesc %arg0 [%c0 , %c0 ], [%arg2 , %arg3 ], [%arg4 , %arg5 ] : ui64 -> !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
161161 %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>> -> vector <16 x16 xf16 >
162162 %2 = xegpu.create_nd_tdesc %arg1 [%c0 , %c0 ], [%arg2 , %arg3 ], [%arg4 , %arg5 ] : ui64 -> !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
163- xegpu.store_nd %1 , %2 { layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>} : vector <16 x16 xf16 >, !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
163+ xegpu.store_nd %1 , %2 : vector <16 x16 xf16 >, !xegpu.tensor_desc <16 x16 xf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
164164 gpu.return
165165 }
166166}
@@ -205,10 +205,10 @@ gpu.func @gemm_loop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>
205205 %6 = xegpu.create_nd_tdesc %arg1 [%arg3 , %1 ] : memref <1024 x1024 xbf16 > -> !xegpu.tensor_desc <16 x16 xbf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [2 , 1 ]>>
206206 %7 = xegpu.load_nd %5 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : !xegpu.tensor_desc <8 x16 xbf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>> -> vector <8 x16 xbf16 >
207207 %8 = xegpu.load_nd %6 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [2 , 1 ]>} : !xegpu.tensor_desc <16 x16 xbf16 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [2 , 1 ]>> -> vector <16 x16 xbf16 >
208- %9 = xegpu.dpas %7 , %8 , %arg4 {layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>, layout_operand_1 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 2 , 1 ]>, layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : vector <8 x16 xbf16 >, vector <16 x16 xbf16 >, vector <8 x16 xf32 > -> vector <8 x16 xf32 >
209- scf.yield { layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>} %9 : vector <8 x16 xf32 >
210- } {layout_operand_3 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>, layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>}
211- xegpu.store_nd %4 , %2 { layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>} : vector <8 x16 xf32 >, !xegpu.tensor_desc <8 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
208+ %9 = xegpu.dpas %7 , %8 , %arg4 {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} : vector <8 x16 xbf16 >, vector <16 x16 xbf16 >, vector <8 x16 xf32 > -> vector <8 x16 xf32 >
209+ scf.yield %9 : vector <8 x16 xf32 >
210+ } {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>}
211+ xegpu.store_nd %4 , %2 : vector <8 x16 xf32 >, !xegpu.tensor_desc <8 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
212212 gpu.return
213213}
214214}
@@ -227,7 +227,7 @@ gpu.module @test {
227227 %cst = arith.constant {layout_result_0 = #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>} dense <1.000000e+00 > : vector <16 xf32 >
228228 %0 = xegpu.create_nd_tdesc %arg0 [%c0 ] : memref <256 xf32 > -> !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>>
229229 %1 = xegpu.update_nd_offset %0 , [%c32 ] : !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>>
230- xegpu.store_nd %cst , %1 { layout_operand_0 = #xegpu.layout < lane_layout = [ 16 ], lane_data = [ 1 ]>} : vector <16 xf32 >, !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>>
230+ xegpu.store_nd %cst , %1 : vector <16 xf32 >, !xegpu.tensor_desc <16 xf32 , #xegpu.layout <lane_layout = [16 ], lane_data = [1 ]>>
231231 gpu.return
232232 }
233233}
@@ -246,7 +246,7 @@ gpu.module @test {
246246 %cst = arith.constant {layout_result_0 = #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>} dense <1.000000e+00 > : vector <16 x16 xf32 >
247247 %0 = xegpu.create_nd_tdesc %arg0 [%c0 , %c0 ] : memref <256 x256 xf32 > -> !xegpu.tensor_desc <16 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
248248 %1 = xegpu.update_nd_offset %0 , [%c32 , %c32 ] : !xegpu.tensor_desc <16 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
249- xegpu.store_nd %cst , %1 { layout_operand_0 = #xegpu.layout < lane_layout = [ 1 , 16 ], lane_data = [ 1 , 1 ]>} : vector <16 x16 xf32 >, !xegpu.tensor_desc <16 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
249+ xegpu.store_nd %cst , %1 : vector <16 x16 xf32 >, !xegpu.tensor_desc <16 x16 xf32 , #xegpu.layout <lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
250250 gpu.return
251251 }
252252}
0 commit comments