@@ -51,6 +51,31 @@ func.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf1
   return
 }
 
+// -----
+#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>
+func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) {
+  %c0 = arith.constant 0 : index
+  %c16 = arith.constant 16 : index
+  %c256 = arith.constant 256 : index
+  scf.for %arg8 = %c0 to %c256 step %c16 {
+    %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a>
+    %1 = xegpu.load_nd %0[%arg8, %c0] {layout_result_0 = #a} : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32>
+    %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a>
+    %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
+    %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) {
+      %5 = xegpu.load_nd %2[%arg8, %arg3] {layout_result_0 = #a} : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16>
+      %6 = xegpu.load_nd %3[%arg8, %arg3] {layout_result_0 = #b} : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+      %7 = vector.transpose %6, [1, 0] {layout_result_0 = #bt} : vector<16x16xf16> to vector<16x16xf16>
+      %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+      scf.yield %8 : vector<8x16xf32>
+    } {layout_result_0 = #a}
+    xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a>
+  }
+  return
+}
+
 // -----
 #a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
 #b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
@@ -90,28 +115,3 @@ func.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg
   xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a>
   return
 }
-
-// -----
-#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
-#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>
-func.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) {
-  %c0 = arith.constant 0 : index
-  %c16 = arith.constant 16 : index
-  %c256 = arith.constant 256 : index
-  scf.for %arg8 = %c0 to %c256 step %c16 {
-    %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a>
-    %1 = xegpu.load_nd %0[%arg8, %c0] {layout_result_0 = #a} : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32>
-    %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a>
-    %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
-    %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) {
-      %5 = xegpu.load_nd %2[%arg8, %arg3] {layout_result_0 = #a} : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16>
-      %6 = xegpu.load_nd %3[%arg8, %arg3] {layout_result_0 = #b} : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
-      %7 = vector.transpose %6, [1, 0] {layout_result_0 = #bt} : vector<16x16xf16> to vector<16x16xf16>
-      %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-      scf.yield %8 : vector<8x16xf32>
-    } {layout_result_0 = #a}
-    xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a>
-  }
-  return
-}
0 commit comments