@@ -607,25 +607,25 @@ gpu.func @vector_shapecast_unsupported(%laneid: index) {
607607}
608608
609609
610- // CHECK-LABEL: gpu.func @vector_extract_strided_slice_outer_distributed
611- // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x16xf32>, vector<2x16xf32>) {
612- // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<32x16xf32>
613- // CHECK: gpu.yield %{{.*}}, %[[S]] : vector<16x16xf32>, vector<32x16xf32>
614- // CHECK: }
615- // CHECK-NEXT: %[[T1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32>
616- // CHECK-NEXT: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16xf32> to vector<1x16xf32>
617- // CHECK-NEXT: "some_use"(%[[T2]]) : (vector<1x16xf32>) -> ()
618- gpu.func @vector_extract_strided_slice_outer_distributed(%laneid: index) {
619- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x16xf32>) {
620- %0 = "some_def"() : () -> (vector<32x16xf32>)
621- %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1],
622- layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
623- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
610+ // CHECK-LABEL: gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted
611+ // CHECK-NEXT: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) {
612+ // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x16xf32>
613+ // CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x16xf32>
614+ // CHECK-NEXT: }
615+ // CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
616+ // CHECK-SAME: {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
617+ // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> ()
618+ gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted(%laneid: index) {
619+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
620+ %0 = "some_def"() : () -> (vector<24x16xf32>)
621+ %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 16], strides = [1, 1],
622+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
623+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
624624 }
625- : vector<32x16xf32> to vector<16x16xf32>
626- gpu.yield %1 : vector<16x16xf32>
625+ : vector<24x16xf32> to vector<8x16xf32>
626+ gpu.yield %1 : vector<8x16xf32>
627627 }
628- "some_use"(%r) : (vector<1x16xf32>) -> ()
628+ "some_use"(%r) : (vector<8x1xf32>) -> ()
629629 gpu.return
630630}
631631
@@ -651,6 +651,28 @@ gpu.func @vector_extract_strided_slice_inner_distributed(%laneid: index) {
651651 gpu.return
652652}
653653
654+ // CHECK-LABEL: gpu.func @vector_extract_strided_slice_outer_distributed
655+ // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x16xf32>, vector<2x16xf32>) {
656+ // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<32x16xf32>
657+ // CHECK: gpu.yield %{{.*}}, %[[S]] : vector<16x16xf32>, vector<32x16xf32>
658+ // CHECK: }
659+ // CHECK-NEXT: %[[T1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32>
660+ // CHECK-NEXT: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16xf32> to vector<1x16xf32>
661+ // CHECK-NEXT: "some_use"(%[[T2]]) : (vector<1x16xf32>) -> ()
662+ gpu.func @vector_extract_strided_slice_outer_distributed(%laneid: index) {
663+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x16xf32>) {
664+ %0 = "some_def"() : () -> (vector<32x16xf32>)
665+ %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1],
666+ layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
667+ layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
668+ }
669+ : vector<32x16xf32> to vector<16x16xf32>
670+ gpu.yield %1 : vector<16x16xf32>
671+ }
672+ "some_use"(%r) : (vector<1x16xf32>) -> ()
673+ gpu.return
674+ }
675+
654676// CHECK-LABEL: gpu.func @vector_extract_strided_slice_1d
655677// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<4xf32>) {
656678// CHECK: %[[S:.*]] = "some_def"() : () -> vector<64xf32>
@@ -709,6 +731,32 @@ gpu.func @vector_extract_strided_slice_unsopported_source(%laneid: index) {
709731 gpu.return
710732}
711733
734+
735+ // CHECK-LABEL: gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted
736+ // CHECK-NEXT: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<16x1xf32>, vector<64x1xf32>) {
737+ // CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
738+ // CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x16xf32>
739+ // CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x16xf32>, vector<16x16xf32>, vector<64x16xf32>
740+ // CHECK-NEXT: }
741+ // CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
742+ // CHECK-SAME: {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
743+ // CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x1xf32>) -> ()
744+ gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted(%laneid: index) {
745+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) {
746+ %0 = "some_def"() : () -> (vector<16x16xf32>)
747+ %1 = "some_def"() : () -> (vector<64x16xf32>)
748+ %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1],
749+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
750+ layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
751+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
752+ }
753+ : vector<16x16xf32> into vector<64x16xf32>
754+ gpu.yield %2 : vector<64x16xf32>
755+ }
756+ "some_use"(%r) : (vector<64x1xf32>) -> ()
757+ gpu.return
758+ }
759+
712760// CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
713761// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x2xf32>, vector<16x1xf32>, vector<64x2xf32>) {
714762// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
0 commit comments