30 changes: 16 additions & 14 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59085,20 +59085,22 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
    return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
  }

  // If we are extracting from an insert into a larger vector, replace with a
  // smaller insert if we don't access less than the original subvector. Don't
  // do this for i1 vectors.
  // TODO: Relax the matching indices requirement?
  if (VT.getVectorElementType() != MVT::i1 &&
      InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
      IdxVal == InVec.getConstantOperandVal(2) &&
      InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
    SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
                                 InVec.getOperand(0), N->getOperand(1));
    unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
                       InVec.getOperand(1),
                       DAG.getVectorIdxConstant(NewIdxVal, DL));
  // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
  // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
  // iff SUB is entirely contained in the extraction.
  if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
      InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
    SDValue Src = InVec.getOperand(0);
    SDValue Sub = InVec.getOperand(1);
    EVT SubVT = Sub.getValueType();
    uint64_t InsIdx = InVec.getConstantOperandVal(2);
    if (IdxVal <= InsIdx &&
        (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
      SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
                                   DAG.getVectorIdxConstant(IdxVal, DL));
      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
                         DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
    }
  }

  // If we're extracting an upper subvector from a broadcast we should just
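Reviewer note: a minimal sketch of the kind of pattern the relaxed fold is meant to catch (hypothetical IR, not taken from this patch). Here the 128-bit insert lands at element 4 and the 256-bit extract starts at element 0, so the inserted subvector is fully covered by the extraction; the code being removed only folded when the extract and insert indices were equal.

define <8 x i32> @extract_of_insert(<16 x i32> %src, <4 x i32> %sub) {
  ; elements 4..7 of the 512-bit value are replaced by %sub
  %ins = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> %src, <4 x i32> %sub, i64 4)
  ; the low 8 elements are extracted, which entirely contain %sub, so this can
  ; be rewritten as insert(extract(%src, 0), %sub, 4 - 0) on v8i32 and the
  ; 512-bit intermediate disappears
  %ext = call <8 x i32> @llvm.vector.extract.v8i32.v16i32(<16 x i32> %ins, i64 0)
  ret <8 x i32> %ext
}

declare <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32>, <4 x i32>, i64)
declare <8 x i32> @llvm.vector.extract.v8i32.v16i32(<16 x i32>, i64)

The AVX512 test deltas below show the practical effect: inserts that previously had to build a 512-bit zmm value can now stay at 256 bits (vinserti32x4 into zmm becomes vinserti128 into ymm).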
@@ -993,13 +993,13 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa (%r8), %xmm2
; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27]
; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
@@ -1035,7 +1035,6 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5]
; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
@@ -1044,6 +1043,7 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
@@ -1085,13 +1085,13 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
@@ -1127,7 +1127,6 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5]
; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
@@ -1136,6 +1135,7 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
64 changes: 32 additions & 32 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -740,14 +740,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -763,14 +763,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -786,14 +786,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -809,14 +809,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -832,14 +832,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -855,14 +855,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -878,14 +878,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
@@ -901,14 +901,14 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx)
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq