diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e51d15a4d3825..3d5ef1fc28ec8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -59284,36 +59284,34 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) { SDValue ExtSrc = SubVec.getOperand(0); int ExtIdxVal = SubVec.getConstantOperandVal(1); - if (ExtIdxVal != 0) { - SmallVector<int, 64> Mask(VecNumElts); - // First create an identity shuffle mask. - for (int i = 0; i != VecNumElts; ++i) - Mask[i] = i; - // Now insert the extracted portion. - for (int i = 0; i != SubVecNumElts; ++i) - Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; + // Create a shuffle mask matching the extraction and insertion. + SmallVector<int, 64> Mask(VecNumElts); + std::iota(Mask.begin(), Mask.end(), 0); + std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts, + ExtIdxVal + VecNumElts); + if (ExtIdxVal != 0) return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask); - } - // If we're broadcasting, see if we can use a blend instead of - // extract/insert pair. Ensure that the subvector is aligned with the - // insertion/extractions. - if ((ExtIdxVal % SubVecNumElts) == 0 && (IdxVal % SubVecNumElts) == 0 && - (ExtSrc.getOpcode() == X86ISD::VBROADCAST || - ExtSrc.getOpcode() == X86ISD::VBROADCAST_LOAD || - (ExtSrc.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && - cast<MemIntrinsicSDNode>(ExtSrc)->getMemoryVT() == SubVecVT))) { + // See if we can use a blend instead of extract/insert pair. 
+ SmallVector<int, 64> BlendMask(VecNumElts); + std::iota(BlendMask.begin(), BlendMask.end(), 0); + std::iota(BlendMask.begin() + IdxVal, + BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal); + if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) && + VecNumElts == (2 * SubVecNumElts)) { + assert((IdxVal == 0 || IdxVal == SubVecNumElts) && + "Unaligned subvector insertion"); if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) { - uint64_t BlendMask = IdxVal == 0 ? 0x0F : 0xF0; SDValue Blend = DAG.getNode( X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec), DAG.getBitcast(MVT::v8f32, ExtSrc), - DAG.getTargetConstant(BlendMask, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8)); return DAG.getBitcast(OpVT, Blend); } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) { - SDValue Lo = DAG.getBitcast(MVT::v8f64, IdxVal == 0 ? ExtSrc : Vec); - SDValue Hi = DAG.getBitcast(MVT::v8f64, IdxVal == 0 ? Vec : ExtSrc); + MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64; + SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec); + SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? 
Vec : ExtSrc); SDValue Shuffle = - DAG.getNode(X86ISD::SHUF128, dl, MVT::v8f64, Lo, Hi, + DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi, getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG)); return DAG.getBitcast(OpVT, Shuffle); } diff --git a/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll b/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll index 9b35857804022..0717a01ac2abc 100644 --- a/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll @@ -7,9 +7,8 @@ define void @insert_subvector_broadcast_as_blend() { ; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: incq %rax ; CHECK-NEXT: vpbroadcastq %rax, %zmm0 -; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm0[0,1,2,3,4,5,6] ; CHECK-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 ; CHECK-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k1 ; CHECK-NEXT: kunpckbw %k0, %k1, %k1 diff --git a/llvm/test/CodeGen/X86/widen_fadd.ll b/llvm/test/CodeGen/X86/widen_fadd.ll index c3700189d3d0e..f8cde4cf223a7 100644 --- a/llvm/test/CodeGen/X86/widen_fadd.ll +++ b/llvm/test/CodeGen/X86/widen_fadd.ll @@ -221,7 +221,7 @@ define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2 ; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0 -; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-NEXT: vmovupd %zmm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll index 2d9e3f60bee46..fdf895921ca67 100644 --- 
a/llvm/test/CodeGen/X86/widen_fdiv.ll +++ b/llvm/test/CodeGen/X86/widen_fdiv.ll @@ -182,7 +182,7 @@ define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2 ; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0 -; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-NEXT: vmovupd %zmm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/widen_fmul.ll b/llvm/test/CodeGen/X86/widen_fmul.ll index 6c3e0ff5a9bcd..16baa068fc24f 100644 --- a/llvm/test/CodeGen/X86/widen_fmul.ll +++ b/llvm/test/CodeGen/X86/widen_fmul.ll @@ -221,7 +221,7 @@ define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2 ; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0 -; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-NEXT: vmovupd %zmm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/widen_fsub.ll b/llvm/test/CodeGen/X86/widen_fsub.ll index 7405d9b7b1c65..8dcd887ab4144 100644 --- a/llvm/test/CodeGen/X86/widen_fsub.ll +++ b/llvm/test/CodeGen/X86/widen_fsub.ll @@ -221,7 +221,7 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2 ; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0 -; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-NEXT: vmovupd %zmm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq