@@ -921,26 +921,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -951,26 +951,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512-FCP-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -981,26 +981,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512DQ-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512DQ-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512DQ-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -1011,26 +1011,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;