@@ -14,9 +14,14 @@ gpu.module @test_1_1_assignment {
1414 // CHECK: %[[REM:.*]] = affine.apply #map1()[%[[SGID]]]
1515 // CHECK: %[[MUL1:.*]] = index.mul %[[DIV]], %[[C12]]
1616 // CHECK: %[[MUL2:.*]] = index.mul %[[REM]], %[[C8]]
17+ // CHECK: %[[C24:.*]] = arith.constant 24 : index
18+ // CHECK: %[[MOD:.*]] = index.remu %[[MUL1]], %[[C24]]
1719 // CHECK: %[[C0:.*]] = arith.constant 0 : index
18- // CHECK: %[[ADD1:.*]] = index.add %[[MUL1]], %[[C0]]
19- // CHECK: %[[ADD2:.*]] = index.add %[[MUL2]], %[[C0]]
20+ // CHECK: %[[ADD1:.*]] = index.add %[[MOD]], %[[C0]]
21+ // CHECK: %[[C32:.*]] = arith.constant 32 : index
22+ // CHECK: %[[MOD1:.*]] = index.remu %[[MUL2]], %[[C32]]
23+ // CHECK: %[[C0_1:.*]] = arith.constant 0 : index
24+ // CHECK: %[[ADD2:.*]] = index.add %[[MOD1]], %[[C0_1]]
2025 // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[ADD1]], %[[ADD2]]] : memref<24x32xf32>
2126 // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
2227 // CHECK: gpu.return
@@ -108,6 +113,40 @@ gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
108113 gpu.return
109114 }
110115
116+
117+ // CHECK-LABEL: test_dpas_no_sg_data
118+ // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
119+ // CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
120+ gpu.func @test_dpas_no_sg_data (%a: memref <24 x32 xf32 >, %b: memref <32 x24 xf32 >) {
121+ // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
122+ // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
123+ // CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
124+ // CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
125+ // CHECK-SAME: -> vector<12x8xf32>
126+ // CHECK: %[[TDESC_B:.*]] = xegpu.create_nd_tdesc %[[ARG_1]][{{%.*}}, {{%.*}}] : memref<32x24xf32>
127+ // CHECK-SAME: -> !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
128+ // CHECK: %[[LOAD_B:.*]] = xegpu.load_nd %[[TDESC_B]]
129+ // CHECK-SAME: : !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
130+ // CHECK-SAME: -> vector<8x12xf32>
131+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]]
132+ // CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
133+ // CHECK-SAME: : vector<12x8xf32>, vector<8x12xf32> -> vector<12x12xf32>
134+ %tdesc_a = xegpu.create_nd_tdesc %a [0 , 0 ] : memref <24 x32 xf32 >
135+ -> !xegpu.tensor_desc <24 x32 xf32 , #xegpu.layout <sg_layout = [2 , 4 ], lane_layout = [2 , 8 ], lane_data = [1 , 1 ]>>
136+ %load_a = xegpu.load_nd %tdesc_a
137+ : !xegpu.tensor_desc <24 x32 xf32 , #xegpu.layout <sg_layout = [2 , 4 ], lane_layout = [2 , 8 ], lane_data = [1 , 1 ]>>
138+ -> vector <24 x32 xf32 >
139+ %tdesc_b = xegpu.create_nd_tdesc %b [0 , 0 ] : memref <32 x24 xf32 >
140+ -> !xegpu.tensor_desc <32 x24 xf32 , #xegpu.layout <sg_layout = [4 , 2 ], lane_layout = [8 , 2 ], lane_data = [1 , 1 ]>>
141+ %load_b = xegpu.load_nd %tdesc_b
142+ : !xegpu.tensor_desc <32 x24 xf32 , #xegpu.layout <sg_layout = [4 , 2 ], lane_layout = [8 , 2 ], lane_data = [1 , 1 ]>>
143+ -> vector <32 x24 xf32 >
144+ %dpas = xegpu.dpas %load_a , %load_b
145+ {layout = #xegpu.layout <sg_layout = [2 , 2 ], lane_layout = [2 , 2 ], lane_data = [1 , 1 ]>}
146+ : vector <24 x32 xf32 >, vector <32 x24 xf32 > -> vector <24 x24 xf32 >
147+ gpu.return
148+ }
149+
111150 // CHECK-LABEL: test_prefetch_nd_tdesc
112151 // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
113152 gpu.func @test_prefetch_nd_tdesc (%src: memref <24 x32 xf32 >) {
0 commit comments