@@ -214,3 +214,54 @@ gpu.module @xevm_module{
   }
 }
+
+// -----
+// CHECK-LABEL: gpu.func @warp_scf_for_unused_uniform_for_result(
+// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%{{.*}} : index,
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: memref<16x16xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
+// CHECK: gpu.yield %{{.*}}, {{.*}} : vector<16x16xf32>, vector<16x1xf32>
+// CHECK: }
+// CHECK: %{{.*}}:2 = scf.for {{.*}} to %{{.*}} step %{{.*}} iter_args
+// CHECK-SAME: (%{{.*}} = %[[W]]#0, %{{.*}} = %[[W]]#1) -> (vector<16x1xf32>, vector<16x1xf32>) {
+// CHECK: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
+// CHECK-SAME: args(%{{.*}} : vector<16x1xf32>, vector<16x1xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
+// CHECK: gpu.yield %{{.*}}, %{{.*}} : vector<16x16xf32>, vector<16x1xf32>
+// CHECK: }
+// CHECK: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<16x1xf32>, vector<16x1xf32>
+// CHECK: }
+gpu.module @xevm_module {
+  gpu.func @warp_scf_for_unused_uniform_for_result(%arg0: index,
+      %arg1: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+      %arg2: memref<16x16xf32>) {
+    %c128 = arith.constant 128 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %ini = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : () -> (vector<16x1xf32>)
+    %ini2 = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : () -> (vector<16x16xf32>)
+    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini2, %arg5 = %ini) -> (vector<16x16xf32>, vector<16x1xf32>) {
+      %1 = "some_def"(%arg5)
+        {
+          layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+          layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+        }
+        : (vector<16x1xf32>) -> (vector<16x1xf32>)
+      %acc = "some_def"(%arg4, %1)
+        {
+          layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+          layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+          layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+        }
+        : (vector<16x16xf32>, vector<16x1xf32>) -> (vector<16x16xf32>)
+      scf.yield %acc, %1 : vector<16x16xf32>, vector<16x1xf32>
+    }
+    {
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    xegpu.store_nd %3#0, %arg1[%c0, %c0]
+      : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}