Commit b9cb931
[X86] combineConcatVectorOps - convert X86ISD::VPERMV3 concatenation to use combineConcatVectorOps recursion

Only concatenate X86ISD::VPERMV3 nodes if at least one operand is beneficial to concatenate. Also add the missing useAVX512Regs check.
1 parent a7f1dc0 commit b9cb931
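In combineConcatVectorOps, ConcatSubOperand unconditionally materialises the concatenation of an operand's subvectors, whereas CombineSubOperand (the recursion named in the title) returns a non-null SDValue only when that concatenation folds into something cheaper. A minimal standalone sketch of the gating idiom this commit adopts — tryFoldConcat, forceConcat, Op0Folds are hypothetical stand-in names for illustration, not LLVM APIs:

#include <iostream>
#include <optional>
#include <string>

std::optional<std::string> tryFoldConcat(bool Foldable) {
  // Stand-in for CombineSubOperand: succeed only when the wide
  // concatenation simplifies via the combineConcatVectorOps recursion.
  if (Foldable)
    return std::string("reused wide vector");
  return std::nullopt;
}

std::string forceConcat() {
  // Stand-in for ConcatSubOperand: always build a fresh concat node.
  return "freshly built concat";
}

int main() {
  bool Op0Folds = true;   // e.g. both halves extract from one 512-bit source
  bool Op1Folds = false;  // this operand would need a new CONCAT_VECTORS

  std::optional<std::string> Concat0 = tryFoldConcat(Op0Folds);
  std::optional<std::string> Concat1 = tryFoldConcat(Op1Folds);

  // The commit's gate: merge the two VPERMV3 nodes into one 512-bit
  // shuffle only if at least one operand concatenation comes for free;
  // otherwise keep the narrow shuffles.
  if (Concat0 || Concat1)
    std::cout << "wide shuffle of (" << Concat0.value_or(forceConcat())
              << ", " << Concat1.value_or(forceConcat()) << ")\n";
  else
    std::cout << "keep the split 256-bit shuffles\n";
  return 0;
}

As the context lines in the first hunk below show, the adjacent X86ISD::VPERMV case already carried the useAVX512Regs guard; VPERMV3 now matches it.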

3 files changed: 116 additions & 109 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 12 additions & 5 deletions
@@ -58106,6 +58106,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       }
       break;
     case X86ISD::VPERMV:
+      // TODO: Handle 256-bit and NumOps == 4 cases.
       if (!IsSplat && NumOps == 2 &&
           (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
         MVT OpVT = Op0.getSimpleValueType();
@@ -58131,7 +58132,9 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       }
       break;
     case X86ISD::VPERMV3:
-      if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
+      // TODO: Handle 256-bit and NumOps == 4 cases.
+      if (!IsSplat && NumOps == 2 &&
+          (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
         MVT OpVT = Op0.getSimpleValueType();
         int NumSrcElts = OpVT.getVectorNumElements();
         SmallVector<int, 64> ConcatMask;
@@ -58153,10 +58156,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
           }
         }
         if (ConcatMask.size() == (NumOps * NumSrcElts)) {
-          SDValue Src0 = ConcatSubOperand(VT, Ops, 0);
-          SDValue Src1 = ConcatSubOperand(VT, Ops, 2);
-          return lowerShuffleWithPERMV(DL, VT, ConcatMask, Src0, Src1,
-                                       Subtarget, DAG);
+          SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+          SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
+          if (Concat0 || Concat1)
+            return lowerShuffleWithPERMV(
+                DL, VT, ConcatMask,
+                Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
+                Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
+                DAG);
         }
       }
       break;
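The regenerated test checks below follow directly from this gating. For example (lines excerpted verbatim from the first load_i8_stride2_vf64 hunk), a 512-bit insert-plus-permute chain such as

; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,9,11,4,6,13,15]
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm5

is now kept at 256 bits, with one shared ymm index vector:

; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,2,5,7]
; AVX512-FCP-NEXT: vpermt2q %ymm5, %ymm8, %ymm7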

llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll

Lines changed: 32 additions & 32 deletions
@@ -1049,24 +1049,24 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm5
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,2,5,7]
+; AVX512-FCP-NEXT: vpermt2q %ymm5, %ymm8, %ymm7
 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,9,11,4,6,13,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm5
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm5
+; AVX512-FCP-NEXT: vpermt2q %ymm0, %ymm8, %ymm5
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpermt2q %ymm4, %ymm8, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermt2q %ymm2, %ymm8, %ymm1
 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rsi)
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rdx)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
@@ -1110,24 +1110,24 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm5
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,2,5,7]
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm5, %ymm8, %ymm7
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,9,11,4,6,13,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm5
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm5
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm8, %ymm5
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm4, %ymm8, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm2, %ymm8, %ymm1
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rsi)
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rdx)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll

Lines changed: 72 additions & 72 deletions
@@ -921,26 +921,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -951,26 +951,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512-FCP-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -981,26 +981,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512DQ-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512DQ-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512DQ-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -1011,26 +1011,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
