@@ -455,7 +455,7 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){
 }
 
 // -----
-// CHECK-LABEL: func.func @test_scf_while_and_condition(
+// CHECK-LABEL: func.func @scf_while_and_condition(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
 // CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>)
 // CHECK-SAME: -> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
@@ -464,7 +464,7 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){
 // CHECK-NEXT: ^bb0(%{{.*}}: vector<16xf32>, %{{.*}}: i32, %{{.*}}: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>):
 // CHECK: scf.yield {{.*}} : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
 // CHECK-NEXT: } attributes {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-func.func @test_scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32>) {
+func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32>) {
   %c0 = arith.constant 0 : i32
   %c16 = arith.constant 16 : i32
   %c256 = arith.constant 256 : i32
@@ -486,3 +486,79 @@ func.func @test_scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<25
   }
   return
 }
+
+// -----
+// CHECK-LABEL: func.func @vector_shape_cast_2d_to_1d_dim0_distributed(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x1xf16, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x1xf16, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x1xf16>
+// CHECK-NEXT: %{{.*}} = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+// CHECK-SAME: : vector<16x1xf16> to vector<16xf16>
+func.func @vector_shape_cast_2d_to_1d_dim0_distributed(%arg0: !xegpu.tensor_desc<16x1xf16>, %arg1: !xegpu.tensor_desc<16xf16>) {
+  %c0 = arith.constant 0 : index
+  %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x1xf16> -> vector<16x1xf16>
+  %2 = vector.shape_cast %3 : vector<16x1xf16> to vector<16xf16>
+  xegpu.store_nd %2, %arg1 : vector<16xf16>, !xegpu.tensor_desc<16xf16>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @vector_shape_cast_2d_to_1d_dim1_distributed(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<1x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<1x16xf16>
+// CHECK: %{{.*}} = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+// CHECK-SAME: vector<1x16xf16> to vector<16xf16>
+func.func @vector_shape_cast_2d_to_1d_dim1_distributed(%arg0: !xegpu.tensor_desc<1x16xf16>, %arg1: !xegpu.tensor_desc<16xf16>) {
+  %c0 = arith.constant 0 : index
+  %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<1x16xf16> -> vector<1x16xf16>
+  %2 = vector.shape_cast %3 : vector<1x16xf16> to vector<16xf16>
+  xegpu.store_nd %2, %arg1 : vector<16xf16>, !xegpu.tensor_desc<16xf16>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [0] : vector<16x16xf16> to vector<16xf16>
+// CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[REDUCE]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK-SAME: vector<16xf16> to vector<1x16xf16>
+func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.0000> : vector<16xf16>
+  %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %4 = vector.multi_reduction <add>, %3, %cst [0] : vector<16x16xf16> to vector<16xf16>
+  %2 = vector.shape_cast %4 : vector<16xf16> to vector<1x16xf16>
+  %5 = vector.broadcast %2 : vector<1x16xf16> to vector<16x16xf16>
+  xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  return
+}
+
+// -----
+// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1]
+// CHECK-SAME: vector<16x16xf16> to vector<16xf16>
+// CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[REDUCE]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK-SAME: vector<16xf16> to vector<16x1xf16>
+func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.0000> : vector<16xf16>
+  %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %4 = vector.multi_reduction <add>, %3, %cst [1] : vector<16x16xf16> to vector<16xf16>
+  %2 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16>
+  %5 = vector.broadcast %2 : vector<16x1xf16> to vector<16x16xf16>
+  xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  return
+}