diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ad4855d908747..961cab33c579f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2656,7 +2656,9 @@ class BoUpSLP {
       }
       // TODO: Check if we can remove a check for non-power-2 number of
       // scalars after full support of non-power-2 vectorization.
-      return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
+      return UniqueValues.size() != 2 &&
+             hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
+                                      UniqueValues.size());
     };
 
     // If the initial strategy fails for any of the operand indexes, then we
@@ -5101,12 +5103,13 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         });
       });
       const unsigned AbsoluteDiff = std::abs(*Diff);
-      if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
-                                ((Sz > MinProfitableStridedLoads ||
-                                  (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
-                                   has_single_bit(AbsoluteDiff))) &&
-                                 AbsoluteDiff > Sz) ||
-                                *Diff == -(static_cast<int>(Sz) - 1))) {
+      if (IsPossibleStrided &&
+          (IsAnyPointerUsedOutGraph ||
+           (AbsoluteDiff > Sz &&
+            (Sz > MinProfitableStridedLoads ||
+             (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
+              AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
+           *Diff == -(static_cast<int>(Sz) - 1))) {
        int Stride = *Diff / static_cast<int>(Sz - 1);
        if (*Diff == Stride * static_cast<int>(Sz - 1)) {
          Align Alignment =
@@ -5192,9 +5195,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
 
     // FIXME: The following code has not been updated for non-power-of-2
-    // vectors. The splitting logic here does not cover the original
-    // vector if the vector factor is not a power of two. FIXME
-    if (!has_single_bit(VL.size()))
+    // vectors (and not whole registers). The splitting logic here does not
+    // cover the original vector if the vector factor is not a power of two.
+    if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
       return false;
 
     unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
@@ -5202,7 +5205,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
     DemandedElts.clearAllBits();
     // Iterate through possible vectorization factors and check if vectorized +
     // shuffles is better than just gather.
-    for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
+    for (unsigned VF =
+             getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
+         VF >= MinVF;
+         VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
       SmallVector<LoadsState> States;
       for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
         ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
@@ -7632,8 +7638,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
     bool Reuse = canReuseExtract(VL, CurrentOrder);
-    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
-    if (!has_single_bit(VL.size()))
+    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
+    // non-full registers).
+    if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
       return TreeEntry::NeedToGather;
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
@@ -8089,7 +8096,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
   if ((UserTreeIdx.UserTE &&
        UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
-      !has_single_bit(VL.size())) {
+      !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
     LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                          "for nodes with padding.\n");
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
@@ -9840,7 +9847,8 @@ void BoUpSLP::transformNodes() {
         if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
             (S.getOpcode() == Instruction::Load &&
              areKnownNonVectorizableLoads(Slice)) ||
-            (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
+            (S.getOpcode() != Instruction::Load &&
+             !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
           continue;
         if (VF == 2) {
           // Try to vectorize reduced values or if all users are vectorized.
@@ -13618,8 +13626,9 @@ BoUpSLP::isGatherShuffledEntry(
           return !TE->isGather();
         })))
     return {};
-  // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
-  if (TE->isNonPowOf2Vec())
+  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
+  // implemented yet.
+  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
     return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert((TE->UserTreeIndices.size() == 1 ||
@@ -19200,9 +19209,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     }
   }
 
+  Type *ScalarTy = getValueType(VL[0]);
   unsigned Sz = R.getVectorElementSize(I0);
   unsigned MinVF = R.getMinVF(Sz);
-  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
+  unsigned MaxVF = std::max<unsigned>(
+      getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
   MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
   if (MaxVF < 2) {
     R.getORE()->emit([&]() {
@@ -19216,10 +19227,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   bool Changed = false;
   bool CandidateFound = false;
   InstructionCost MinCost = SLPCostThreshold.getValue();
-  Type *ScalarTy = getValueType(VL[0]);
 
   unsigned NextInst = 0, MaxInst = VL.size();
-  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
+  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
+       VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
     // No actual vectorization should happen, if number of parts is the same as
     // provided vectorization factor (i.e. the scalar type is used for vector
     // code during codegen).
@@ -19234,7 +19245,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
 
       if (MaxVFOnly && ActualVF < MaxVF)
         break;
-      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
+      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
         break;
 
       SmallVector<Value *> Ops(ActualVF, nullptr);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
index dadd22217a3e6..02327272f3ab1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
@@ -7,18 +7,14 @@ define i32 @test() {
 ; CHECK-NEXT:    br label %[[FUNC_135_EXIT_I:.*]]
 ; CHECK:       [[FUNC_135_EXIT_I]]:
 ; CHECK-NEXT:    [[G_228_PROMOTED166_I1105_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]],
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP5]], <16 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[G_228_PROMOTED166_I1105_I]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <12 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <16 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[G_228_PROMOTED166_I1105_I]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 0, i32 15
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP9]], <16 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v12i32(<16 x i32> poison, <12 x i32> [[TMP3]], i64 0)
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP8]], <16 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-loads-non-power-of-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-loads-non-power-of-2.ll
index be0ed2c34a365..60b0f758133fb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-loads-non-power-of-2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-loads-non-power-of-2.ll
@@ -5,12 +5,7 @@ define <6 x double> @test(ptr %a) {
 ; CHECK-LABEL: define <6 x double> @test(
 ; CHECK-SAME: ptr [[A:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[A]], i16 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <6 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <6 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <6 x double> [[TMP3]], <6 x double> [[TMP4]], <6 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = load <6 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    ret <6 x double> [[TMP5]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
index aff66dd7c10ea..9fc2b7d6e7865 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
@@ -9,10 +9,10 @@ define void @test(ptr noalias %0, ptr noalias %1) {
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <6 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <6 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <6 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP7]], <6 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP10]], <6 x i32>
 ; CHECK-NEXT:    store <6 x double> [[TMP13]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
 ; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
index 6d22bb06d5e03..70b7f14a3a2c9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
@@ -4,11 +4,10 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:    [[XOR108_I_I_I:%.*]] = xor i64 0, 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> , i64 [[XOR108_I_I_I]], i32 2
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <12 x i64> , i64 [[XOR108_I_I_I]], i32 10
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <12 x i64> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i64> poison, i64 [[XOR108_I_I_I]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> poison, <8 x i64> zeroinitializer, i64 0)
-; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v4i64(<16 x i64> [[TMP4]], <4 x i64> [[TMP2]], i64 8)
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v12i64(<16 x i64> poison, <12 x i64> [[TMP2]], i64 0)
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP5]], <16 x i64> [[TMP3]], <16 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] to <16 x i1>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll
index bb7964146c44d..d1617c9a382d1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll
@@ -8,23 +8,23 @@ define void @test(i32 %arg) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP6:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP6:%.*]], %[[BB1]] ]
 ; CHECK-NEXT:    [[PHI3:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[OP_RDX4:%.*]], %[[BB1]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32>
 ; CHECK-NEXT:    [[ADD17:%.*]] = add i32 [[PHI]], 0
-; CHECK-NEXT:    [[ADD18:%.*]] = add i32 [[PHI2]], 0
+; CHECK-NEXT:    [[ADD4:%.*]] = add i32 [[PHI]], 0
 ; CHECK-NEXT:    [[ADD19:%.*]] = add i32 [[PHI2]], 0
-; CHECK-NEXT:    [[ADD23:%.*]] = add i32 [[PHI2]], 0
+; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[PHI]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4]] = add <2 x i32> [[TMP0]],
 ; CHECK-NEXT:    [[TMP5]] = extractelement <2 x i32> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[TMP6]] = extractelement <2 x i32> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i32 [[TMP7]], [[ADD18]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = xor i32 [[ADD17]], [[ADD19]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = xor i32 [[ADD23]], [[TMP6]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i32 [[TMP7]], [[ADD17]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = xor i32 [[ADD4]], [[ADD6]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = xor i32 [[ADD19]], [[TMP6]]
 ; CHECK-NEXT:    [[OP_RDX3:%.*]] = xor i32 [[OP_RDX]], [[OP_RDX1]]
 ; CHECK-NEXT:    [[OP_RDX4]] = xor i32 [[OP_RDX3]], [[OP_RDX2]]
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i32 [[TMP5]], 0
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 40568f9c8a509..c30f94159916a 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -7,50 +7,52 @@ define i1 @test(float %0, double %1) {
 ; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3
 ; X86-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; X86-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0
-; X86-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
-; X86-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32>
-; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32>
-; X86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32>
-; X86-NEXT:    [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
-; X86-NEXT:    [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
-; X86-NEXT:    [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP10]], i64 0)
-; X86-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0)
-; X86-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4)
-; X86-NEXT:    [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]]
-; X86-NEXT:    [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]]
-; X86-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32>
-; X86-NEXT:    [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float>
-; X86-NEXT:    [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer
-; X86-NEXT:    [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer
-; X86-NEXT:    [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]]
-; X86-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]])
-; X86-NEXT:    ret i1 [[TMP22]]
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> , double [[TMP1]], i32 4
+; X86-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32>
+; X86-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32>
+; X86-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32>
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> , <4 x i32>
+; X86-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32>
+; X86-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> , <4 x i32>
+; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP13]], i64 0)
+; X86-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v6f64(<8 x double> , <6 x double> [[TMP8]], i64 0)
+; X86-NEXT:    [[TMP16:%.*]] = fsub <8 x double> [[TMP14]], [[TMP15]]
+; X86-NEXT:    [[TMP17:%.*]] = fmul <8 x double> [[TMP14]], [[TMP15]]
+; X86-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32>
+; X86-NEXT:    [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
+; X86-NEXT:    [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
+; X86-NEXT:    [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
+; X86-NEXT:    [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
+; X86-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
+; X86-NEXT:    ret i1 [[TMP23]]
 ;
 ; AARCH64-LABEL: define i1 @test
 ; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3
 ; AARCH64-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0
-; AARCH64-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
-; AARCH64-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32>
-; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32>
-; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32>
-; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP4]], <4 x i32>
-; AARCH64-NEXT:    [[TMP11:%.*]] = fmul <4 x double> [[TMP8]], [[TMP10]]
-; AARCH64-NEXT:    [[TMP12:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
-; AARCH64-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0)
-; AARCH64-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP12]], i64 0)
-; AARCH64-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP14]], <2 x double> [[TMP6]], i64 4)
-; AARCH64-NEXT:    [[TMP16:%.*]] = fsub <8 x double> [[TMP13]], [[TMP15]]
-; AARCH64-NEXT:    [[TMP17:%.*]] = fmul <8 x double> [[TMP13]], [[TMP15]]
-; AARCH64-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32>
-; AARCH64-NEXT:    [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
-; AARCH64-NEXT:    [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
-; AARCH64-NEXT:    [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
-; AARCH64-NEXT:    [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
-; AARCH64-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
-; AARCH64-NEXT:    ret i1 [[TMP23]]
+; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> , double [[TMP1]], i32 4
+; AARCH64-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32>
+; AARCH64-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32>
+; AARCH64-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32>
+; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> , <4 x i32>
+; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32>
+; AARCH64-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> , <4 x i32>
+; AARCH64-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP4]], <4 x i32>
+; AARCH64-NEXT:    [[TMP14:%.*]] = fmul <4 x double> [[TMP10]], [[TMP13]]
+; AARCH64-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP14]], i64 0)
+; AARCH64-NEXT:    [[TMP16:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v6f64(<8 x double> , <6 x double> [[TMP8]], i64 0)
+; AARCH64-NEXT:    [[TMP17:%.*]] = fsub <8 x double> [[TMP15]], [[TMP16]]
+; AARCH64-NEXT:    [[TMP18:%.*]] = fmul <8 x double> [[TMP15]], [[TMP16]]
+; AARCH64-NEXT:    [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32>
+; AARCH64-NEXT:    [[TMP20:%.*]] = fptrunc <8 x double> [[TMP19]] to <8 x float>
+; AARCH64-NEXT:    [[TMP21:%.*]] = fmul <8 x float> [[TMP20]], zeroinitializer
+; AARCH64-NEXT:    [[TMP22:%.*]] = fcmp oeq <8 x float> [[TMP21]], zeroinitializer
+; AARCH64-NEXT:    [[TMP23:%.*]] = freeze <8 x i1> [[TMP22]]
+; AARCH64-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP23]])
+; AARCH64-NEXT:    ret i1 [[TMP24]]
 ;
   %3 = fpext float %0 to double
   %4 = fpext float 0.000000e+00 to double
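Note (not part of the patch): the core change above is that VF candidates are no longer restricted to halving a power-of-2 floor; the patch steps through the largest whole-register multiples first and only then power-of-2 sizes. The following standalone C++ sketch models that iteration under simplifying assumptions: `RegWidthInElts` stands in for what the real code derives from TTI and the scalar type, and the two helpers are simplified stand-ins for the patch's `hasFullVectorsOrPowerOf2` and `getFloorFullVectorNumberOfElements`, not the LLVM implementations.

```cpp
// Illustrative model only; the real helpers take TargetTransformInfo and Type.
#include <cstdio>

// A size is acceptable if it is a power of two or fills whole registers.
static bool hasFullVectorsOrPowerOf2(unsigned RegWidthInElts, unsigned Sz) {
  bool PowerOf2 = Sz != 0 && (Sz & (Sz - 1)) == 0;
  return PowerOf2 || (Sz >= RegWidthInElts && Sz % RegWidthInElts == 0);
}

// Largest candidate VF <= Sz: prefer a whole-register multiple, otherwise
// fall back to the power-of-2 floor (models the new loop step in
// canVectorizeLoads / tryToVectorizeList).
static unsigned getFloorFullVectorNumberOfElements(unsigned RegWidthInElts,
                                                   unsigned Sz) {
  if (Sz >= RegWidthInElts)
    return Sz - Sz % RegWidthInElts;
  unsigned Pow2 = 1;
  while (Pow2 * 2 <= Sz)
    Pow2 *= 2;
  return Pow2;
}

int main() {
  const unsigned RegWidthInElts = 4; // e.g. 4 x double per 256-bit register
  const unsigned MinVF = 2;
  // With 13 scalars the VF sequence becomes 12 -> 8 -> 4 -> 2 instead of the
  // old 8 -> 4 -> 2, so a three-register <12 x ...> body is tried first.
  for (unsigned VF = getFloorFullVectorNumberOfElements(RegWidthInElts, 13);
       VF >= MinVF;
       VF = getFloorFullVectorNumberOfElements(RegWidthInElts, VF - 1))
    std::printf("try VF=%u (accepted=%d)\n", VF,
                hasFullVectorsOrPowerOf2(RegWidthInElts, VF));
  return 0;
}
```

This is also why the test updates above now expect `<6 x double>` and `<12 x i64>` operations with `llvm.vector.insert.v16i64.v12i64`-style intrinsics: non-power-of-2 factors that fill whole registers are kept as single nodes instead of being split into power-of-2 pieces.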