@@ -186,22 +186,29 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
186186 %4 = xegpu.create_nd_tdesc %arg0 [%0 , %c0 ] : memref <1024 x1024 xf16 > -> !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 128 ]>>
187187 %5 = xegpu.create_nd_tdesc %arg1 [%c0 , %1 ] : memref <1024 x1024 xf16 > -> !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [128 , 16 ]>>
188188
189- //CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]] iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) -> (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>)
190- //CHECK: [[a:%.+]] = xegpu.load_nd [[arg4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16>
191- //CHECK: [[b:%.+]] = xegpu.load_nd [[arg5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
192- //CHECK: [[c:%.+]] = xegpu.dpas [[a]], [[b]], [[arg6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32>
193- //CHECK: [[at:%.+]] = xegpu.update_nd_offset [[arg4]], [[[c0]], [[c128]]] : !xegpu.tensor_desc<16x128xf16>
194- //CHECK: [[bt:%.+]] = xegpu.update_nd_offset [[arg5]], [[[c128]], [[c0]]] : !xegpu.tensor_desc<128x16xf16>
195- //CHECK: scf.yield [[at]], [[bt]], [[c]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>
196- %6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args (%arg4 = %4 , %arg5 = %5 , %arg6 = %3 ) -> (!xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 128 ]>>, !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [128 , 16 ]>>, vector <128 x128 xf32 >) {
189+ // CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]]
190+ // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) ->
191+ // CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>)
192+ // CHECK: [[a:%.+]] = xegpu.load_nd [[arg4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16>
193+ // CHECK: [[b:%.+]] = xegpu.load_nd [[arg5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
194+ // CHECK: [[c:%.+]] = xegpu.dpas [[a]], [[b]], [[arg6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32>
195+ // CHECK: [[at:%.+]] = xegpu.update_nd_offset [[arg4]], [[[c0]], [[c128]]] : !xegpu.tensor_desc<16x128xf16>
196+ // CHECK: [[bt:%.+]] = xegpu.update_nd_offset [[arg5]], [[[c128]], [[c0]]] : !xegpu.tensor_desc<128x16xf16>
197+ // CHECK: scf.yield [[at]], [[bt]], [[c]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>
198+ %6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args (%arg4 = %4 , %arg5 = %5 , %arg6 = %3 )
199+ -> (!xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 128 ]>>,
200+ !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [128 , 16 ]>>, vector <128 x128 xf32 >) {
197201 %8 = xegpu.load_nd %arg4 : !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 128 ]>> -> vector <128 x128 xf16 >
198202 %9 = xegpu.load_nd %arg5 : !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [128 , 16 ]>> -> vector <128 x128 xf16 >
199- %10 = xegpu.dpas %8 , %9 , %arg6 {layout_result_0 = #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 16 ]>} : vector <128 x128 xf16 >, vector <128 x128 xf16 >, vector <128 x128 xf32 > -> vector <128 x128 xf32 >
203+ %10 = xegpu.dpas %8 , %9 , %arg6 {layout_result_0 = #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 16 ]>}
204+ : vector <128 x128 xf16 >, vector <128 x128 xf16 >, vector <128 x128 xf32 > -> vector <128 x128 xf32 >
200205 %11 = xegpu.update_nd_offset %arg4 , [%c0 , %c128 ] : !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 128 ]>>
201206 %12 = xegpu.update_nd_offset %arg5 , [%c128 , %c0 ] : !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [128 , 16 ]>>
202- scf.yield %11 , %12 , %10 : !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 128 ]>>, !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [128 , 16 ]>>, vector <128 x128 xf32 >
207+ scf.yield %11 , %12 , %10 : !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 128 ]>>,
208+ !xegpu.tensor_desc <128 x128 xf16 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [128 , 16 ]>>, vector <128 x128 xf32 >
203209 }
204- %7 = xegpu.create_nd_tdesc %arg2 [%0 , %1 ] : memref <1024 x1024 xf32 > -> !xegpu.tensor_desc <128 x128 xf32 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 16 ]>>
210+ %7 = xegpu.create_nd_tdesc %arg2 [%0 , %1 ] : memref <1024 x1024 xf32 >
211+ -> !xegpu.tensor_desc <128 x128 xf32 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 16 ]>>
205212 xegpu.store_nd %6#2 , %7 : vector <128 x128 xf32 >, !xegpu.tensor_desc <128 x128 xf32 , #xegpu.layout <sg_layout = [8 , 8 ], sg_data = [16 , 16 ]>>
206213 gpu.return
207214 }
0 commit comments