@@ -367,6 +367,46 @@ gpu.module @test_distribution {
367
367
gpu.return
368
368
}
369
369
370
// CHECK-LABEL: @vector_reduce_dim_0
gpu.func @vector_reduce_dim_0(%src: memref<4x128xf32>) {
  // Accumulator for the reduction. Its layout is the 2-D load layout with
  // dim 0 sliced away, so it lines up with the rank-1 reduction result.
  %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} dense<1.0> : vector<128xf32>

  // Load the whole 4x128 tile; the layout assigns a 4x4 piece (sg_data) to
  // each entry of the 1x32 subgroup grid (sg_layout).
  %desc = xegpu.create_nd_tdesc %src
    : memref<4x128xf32>
    -> !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
  %tile = xegpu.load_nd %desc[0, 0]
    : !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
    -> vector<4x128xf32>

  // Reducing along dim 0 is expected to be rewritten on the 4x4 per-subgroup
  // shape, yielding a 4-element result.
  // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32>
  %sum = vector.multi_reduction <add>, %tile, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} [0]
    : vector<4x128xf32> to vector<128xf32>
  gpu.return
}
383
+
384
// CHECK-LABEL: @vector_reduce_dim_1
gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) {
  // Accumulator for the reduction. Its layout is the 2-D load layout with
  // dim 1 sliced away, so it lines up with the rank-1 reduction result.
  %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>

  // Load the whole 256x64 tile; the layout assigns a 16x64 piece (sg_data)
  // to each entry of the 16x1 subgroup grid (sg_layout).
  %desc = xegpu.create_nd_tdesc %src
    : memref<256x64xf32>
    -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
  %tile = xegpu.load_nd %desc[0, 0]
    : !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
    -> vector<256x64xf32>

  // Reducing along dim 1 is expected to be rewritten on the 16x64
  // per-subgroup shape, yielding a 16-element result.
  // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32>
  %sum = vector.multi_reduction <add>, %tile, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} [1]
    : vector<256x64xf32> to vector<256xf32>
  gpu.return
}
397
+
398
// CHECK-LABEL: @vector_reduce_4D
gpu.func @vector_reduce_4D(%src: ui64) {
  // Rank-3 accumulator: the 4-D layout with its innermost dim (3) sliced away.
  %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} dense<0.0> : vector<4x2x6xf16>

  // All-zero offsets and an all-true mask, both carrying the same 4-D layout
  // as the value loaded below.
  %offsets = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<0> : vector<4x2x6x32xindex>
  %pred = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<true> : vector<4x2x6x32xi1>

  // Offset/mask-based load from the raw ui64 source into a 4x2x6x32 vector.
  %gathered = xegpu.load %src[%offsets], %pred {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>}
    : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16>

  // Reducing along dim 3 is expected to be rewritten on the 1x1x1x32
  // per-subgroup shape, yielding a 1x1x1 result.
  // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16>
  %sum = vector.multi_reduction <add>, %gathered, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} [3]
    : vector<4x2x6x32xf16> to vector<4x2x6xf16>
  gpu.return
}
409
+
370
410
// CHECK-LABEL: vector_step_op
371
411
gpu.func @vector_step_op_slice_attr () {
372
412
//CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
0 commit comments