@@ -4,14 +4,7 @@ gpu.module @test_round_robin_assignment {
44 // CHECK-LABEL: create_nd_tdesc
55 // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
66 gpu.func @create_nd_tdesc (%src: memref <256 x128 xf32 >) {
7- // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index
8- // CHECK: %[[C4:.*]] = arith.constant 4 : index
9- // CHECK: %[[IDX:.*]] = index.remu %[[SGID]], %[[C4]]
10- // CHECK: %[[IDY_DIV:.*]] = index.divu %[[SGID]], %[[C4]]
11- // CHECK: %[[C8:.*]] = arith.constant 8 : index
12- // CHECK: %[[IDY:.*]] = index.remu %[[IDY_DIV]], %[[C8]]
13- // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32>
14- // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
7+ // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
158 // CHECK-NOT: xegpu.create_nd_tdesc
169 %tdesc = xegpu.create_nd_tdesc %src [0 , 0 ] : memref <256 x128 xf32 >
1710 -> !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
@@ -46,9 +39,7 @@ gpu.module @test_round_robin_assignment {
4639 gpu.func @load_nd_tdesc (%src: memref <256 x128 xf32 >) {
4740 %tdesc = xegpu.create_nd_tdesc %src [0 , 0 ] : memref <256 x128 xf32 >
4841 -> !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
49- // CHECK-COUNT-4: xegpu.load_nd %{{.*}}
50- // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
51- // CHECK-SAME-COUNT-4: -> vector<16x16xf32>
42+ // CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
5243 // CHECK-NOT: xegpu.load_nd
5344 %load = xegpu.load_nd %tdesc
5445 : !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
@@ -61,8 +52,7 @@ gpu.module @test_round_robin_assignment {
6152 gpu.func @store_nd (%src: memref <256 x128 xf32 >) {
6253 %tdesc = xegpu.create_nd_tdesc %src [0 , 0 ] : memref <256 x128 xf32 >
6354 -> !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
64- // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}}
65- // CHECK-SAME-COUNT-4: : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
55+ // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
6656 // CHECK-NOT: xegpu.store_nd
6757 %load = xegpu.load_nd %tdesc
6858 : !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
@@ -77,8 +67,7 @@ gpu.module @test_round_robin_assignment {
7767 gpu.func @update_nd (%src: memref <256 x128 xf32 >){
7868 %tdesc = xegpu.create_nd_tdesc %src [0 , 0 ] : memref <256 x128 xf32 >
7969 -> !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
80- // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16]
81- // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
70+ // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
8271 // CHECK-NOT: xegpu.update_nd_offset
8372 %update = xegpu.update_nd_offset %tdesc , [0 , 16 ]
8473 : !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
@@ -88,13 +77,9 @@ gpu.module @test_round_robin_assignment {
8877 // CHECK-LABEL: dpas
8978 // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>)
9079 gpu.func @dpas (%a: memref <256 x128 xf16 >, %b: memref <128 x256 xf16 >) {
91- // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16>
92- // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
93- // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16>
94- // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
95- // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}}
96- // CHECK-SAME-COUNT-16: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
97- // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
80+ // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
81+ // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
82+ // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
9883 // CHECK-NOT: xegpu.dpas
9984 %tdesc_a = xegpu.create_nd_tdesc %a [0 , 0 ] : memref <256 x128 xf16 >
10085 -> !xegpu.tensor_desc <256 x128 xf16 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
@@ -115,8 +100,7 @@ gpu.module @test_round_robin_assignment {
115100 // CHECK-LABEL: prefetch_nd_tdesc
116101 // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
117102 gpu.func @prefetch_nd_tdesc (%src: memref <256 x128 xf32 >) {
118- // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}}
119- // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
103+ // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
120104 // CHECK-NOT: xegpu.prefetch_nd
121105 %tdesc = xegpu.create_nd_tdesc %src [0 , 0 ] : memref <256 x128 xf32 >
122106 -> !xegpu.tensor_desc <256 x128 xf32 , #xegpu.layout <sg_layout = [8 , 4 ], sg_data = [16 , 16 ], lane_layout = [1 , 16 ], lane_data = [1 , 1 ]>>
@@ -133,9 +117,7 @@ gpu.module @test_round_robin_assignment {
133117 %load = xegpu.load_nd %tdesc
134118 : !xegpu.tensor_desc <128 x1 xf32 , #xegpu.layout <sg_layout = [4 , 1 ], sg_data = [16 , 1 ], lane_layout = [8 , 1 ], lane_data = [1 , 1 ]>>
135119 -> vector <128 x1 xf32 >
136- // CHECK-COUNT-2: vector.broadcast {{.*}}
137- // CHECK-SAME-COUNT-2: {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>}
138- // CHECK-SAME-COUNT-2: : vector<16x1xf32> to vector<16x32xf32>
120+ // CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} : vector<16x1xf32> to vector<16x32xf32>
139121 // CHECK-NOT: vector.broadcast
140122 %broadcast = vector.broadcast %load
141123 {layout_result_0 = #xegpu.layout <sg_layout = [4 , 1 ], sg_data = [16 , 32 ], lane_layout = [8 , 1 ], lane_data = [1 , 1 ]>}
0 commit comments