diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h index 2eb4fd36c5b7d..11385666e7ff8 100644 --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -762,6 +762,12 @@ class Instruction : public User, /// applied to any type. /// LLVM_ABI bool isCommutative() const LLVM_READONLY; + + /// Checks if the operand is commutable. In commutative operations, not all + /// operands might be commutable, e.g. for fmuladd only the first 2 operands + /// are commutable. + LLVM_ABI bool isCommutableOperand(unsigned Op) const LLVM_READONLY; + static bool isCommutative(unsigned Opcode) { switch (Opcode) { case Add: case FAdd: diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 0622bfae2c845..0b25baa465a71 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -101,6 +101,12 @@ class IntrinsicInst : public CallInst { } } + /// Return true if the operand is commutable. + bool isCommutableOperand(unsigned Op) const { + constexpr unsigned NumCommutativeOps = 2; + return isCommutative() && Op < NumCommutativeOps; + } + /// Checks if the intrinsic is an annotation. bool isAssumeLikeIntrinsic() const { switch (getIntrinsicID()) { diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index f3d4d2424fe5b..7682c28e23b33 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -1293,6 +1293,13 @@ bool Instruction::isCommutative() const { return isCommutative(getOpcode()); } +bool Instruction::isCommutableOperand(unsigned Op) const { + if (auto *II = dyn_cast(this)) + return II->isCommutableOperand(Op); + // TODO: Should allow icmp/fcmp? 
+ return isCommutative(getOpcode()); +} + unsigned Instruction::getNumSuccessors() const { switch (getOpcode()) { #define HANDLE_TERM_INST(N, OPC, CLASS) \ diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 83faa89218bcd..878ab72bc377e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -573,6 +573,27 @@ static bool isCommutative(Instruction *I, Value *ValWithUses, return I->isCommutative(); } +/// Checks if the operand is commutable. In commutative operations, not all +/// operands might be commutable, e.g. for fmuladd only the first 2 operands +/// are commutable. +static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op, + bool IsCopyable = false) { + assert(::isCommutative(I, ValWithUses, IsCopyable) && + "The instruction is not commutative."); + if (isa(I)) + return true; + if (auto *BO = dyn_cast(I)) { + switch (BO->getOpcode()) { + case Instruction::Sub: + case Instruction::FSub: + return true; + default: + break; + } + } + return I->isCommutableOperand(Op); +} + /// This is a helper function to check whether \p I is commutative. /// This is a convenience wrapper that calls the two-parameter version of /// isCommutative with the same instruction for both parameters. This is @@ -5326,13 +5347,14 @@ class slpvectorizer::BoUpSLP { if (ScheduleCopyableDataMap.empty()) return false; SmallDenseMap PotentiallyReorderedEntriesCount; - SmallDenseMap OrderedEntriesCount; ArrayRef Entries = SLP.getTreeEntries(User); if (Entries.empty()) return false; + unsigned CurNumOps = 0; for (const Use &U : User->operands()) { if (U.get() != Op) continue; + ++CurNumOps; // Check all tree entries, if they have operands replaced by copyable // data. 
for (TreeEntry *TE : Entries) { @@ -5365,27 +5387,43 @@ class slpvectorizer::BoUpSLP { // Same applies even for non-commutative cmps, because we can invert // their predicate potentially and, thus, reorder the operands. bool IsCommutativeUser = - ::isCommutative(User) || - ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User); - if (!IsCommutativeUser && !isa(User)) { - unsigned &OpCnt = - OrderedEntriesCount.try_emplace(TE, 0).first->getSecond(); + ::isCommutative(User) && + ::isCommutableOperand(User, User, U.getOperandNo()); + if (!IsCommutativeUser) { + Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User); + IsCommutativeUser = + ::isCommutative(MainOp, User) && + ::isCommutableOperand(MainOp, User, U.getOperandNo()); + } + // The commutative user with the same operands can be safely + // considered as non-commutative, operands reordering does not change + // the semantics. + assert( + !IsCommutativeUser || + (((::isCommutative(User) && + ::isCommutableOperand(User, User, 0) && + ::isCommutableOperand(User, User, 1)) || + (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) && + ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User), User, + 0) && + ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User), User, + 1)))) && + "Expected commutative user with 2 first commutable operands"); + bool IsCommutativeWithSameOps = + IsCommutativeUser && User->getOperand(0) == User->getOperand(1); + if ((!IsCommutativeUser || IsCommutativeWithSameOps) && + !isa(User)) { EdgeInfo EI(TE, U.getOperandNo()); - if (!getScheduleCopyableData(EI, Op)) + if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op)) continue; - // Found copyable operand - continue. 
- OpCnt += Inc; - continue; + return false; } PotentiallyReorderedEntriesCount.try_emplace(TE, 0) .first->getSecond() += Inc; } } if (PotentiallyReorderedEntriesCount.empty()) - return all_of(OrderedEntriesCount, - [&](const std::pair &P) { - return P.second == NumOps; - }); + return true; // Check the commutative/cmp entries. for (auto &P : PotentiallyReorderedEntriesCount) { SmallPtrSet ParentsUniqueUsers; @@ -5431,10 +5469,6 @@ class slpvectorizer::BoUpSLP { return all_of(PotentiallyReorderedEntriesCount, [&](const std::pair &P) { return P.second == NumOps - 1; - }) && - all_of(OrderedEntriesCount, - [&](const std::pair &P) { - return P.second == NumOps; }); } @@ -5656,6 +5690,7 @@ class slpvectorizer::BoUpSLP { } }; + SmallDenseSet> Checked; for (ScheduleBundle *Bundle : Bundles) { if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0) break; @@ -5663,7 +5698,6 @@ class slpvectorizer::BoUpSLP { // Need to search for the lane since the tree entry can be // reordered. auto *It = find(Bundle->getTreeEntry()->Scalars, In); - SmallDenseSet> Checked; bool IsNonSchedulableWithParentPhiNode = Bundle->getTreeEntry()->doesNotNeedToSchedule() && Bundle->getTreeEntry()->UserTreeIndex && @@ -10873,7 +10907,9 @@ class InstructionsCompatibilityAnalysis { Opcode == Instruction::LShr || Opcode == Instruction::Shl || Opcode == Instruction::SDiv || Opcode == Instruction::UDiv || Opcode == Instruction::And || Opcode == Instruction::Or || - Opcode == Instruction::Xor; + Opcode == Instruction::Xor || Opcode == Instruction::FAdd || + Opcode == Instruction::FSub || Opcode == Instruction::FMul || + Opcode == Instruction::FDiv; } /// Identifies the best candidate value, which represents main opcode @@ -11214,6 +11250,10 @@ class InstructionsCompatibilityAnalysis { case Instruction::And: case Instruction::Or: case Instruction::Xor: + case Instruction::FAdd: + case Instruction::FMul: + case Instruction::FSub: + case Instruction::FDiv: VectorCost = TTI.getArithmeticInstrCost(MainOpcode, 
VecTy, Kind); break; default: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll index 0783a28f56d85..961662c664a31 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll @@ -11,10 +11,10 @@ define void @p(double %0) { ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x double> zeroinitializer, [[TMP9]] ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll index 0cc4d3db5c537..1abc16da77c8e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll @@ -4,15 +4,16 @@ define void @test() { ; CHECK-LABEL: define void @test() { ; CHECK-NEXT: [[BB:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> , <4 x float> 
poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> , <4 x float> , <4 x i32> ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP9:%.*]], %[[BB1]] ] -; CHECK-NEXT: [[FMUL:%.*]] = fmul float 0.000000e+00, 0.000000e+00 +; CHECK-NEXT: [[FMUL:%.*]] = sitofp i32 0 to float +; CHECK-NEXT: [[SITOFP:%.*]] = sitofp i32 0 to float +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[SITOFP]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> , [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> , <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FMUL]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP0]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll index d13a8578d1e00..c1cc3f2dfc9e5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -7,36 +7,30 @@ define void @main(i1 %arg) { ; CHECK-LABEL: @main( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %arg, label [[COND_TRUE:%.*]], label [[COND_END:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]] ; CHECK: cond.true: ; CHECK-NEXT: unreachable ; CHECK: cond.end: ; CHECK-NEXT: br label [[INVOKE_CONT:%.*]] ; CHECK: invoke.cont: -; CHECK-NEXT: br i1 %arg, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]] +; CHECK-NEXT: br i1 [[ARG]], label 
[[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]] ; CHECK: arrayctor.cont: ; CHECK-NEXT: [[AGG_TMP101211_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0 ; CHECK-NEXT: br label [[FOR_COND36_PREHEADER:%.*]] ; CHECK: for.cond36.preheader: -; CHECK-NEXT: br i1 %arg, label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]] ; CHECK: cond.false51.us: ; CHECK-NEXT: unreachable ; CHECK: cond.true48.us: -; CHECK-NEXT: br i1 %arg, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]] ; CHECK: cond.false66.us: -; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> , double [[ADD_I276_US]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> , [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], splat (double 1.400000e+02) -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], -; CHECK-NEXT: store <2 x double> [[TMP3]], ptr undef, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> , [[TMP1]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8 +; CHECK-NEXT: store <2 x double> , ptr undef, align 8 +; CHECK-NEXT: store <2 x double> , ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8 ; CHECK-NEXT: ret void ; CHECK: cond.true63.us: ; CHECK-NEXT: unreachable ; CHECK: for.body42.lr.ph.us: -; CHECK-NEXT: br i1 %arg, label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]] ; CHECK: _Z5clampd.exit.1: ; CHECK-NEXT: br label [[FOR_COND36_PREHEADER]] ; @@ -96,7 +90,7 @@ _Z5clampd.exit.1: define void @test(i1 %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %arg, label [[IF_THEN78:%.*]], label 
[[IF_THEN38:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]] ; CHECK: if.then38: ; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0 ; CHECK-NEXT: store <2 x double> , ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll index 6d713e83bbf4e..ca65ff88a4b81 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll @@ -9,33 +9,38 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3 ; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] ; CHECK: [[IF_THEN]]: -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[DIV_2_I_I:%.*]] = fmul float [[TMP0]], 0.000000e+00 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> , float [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP20]], <4 x i32> ; 
CHECK-NEXT: br label %[[IF_END]] ; CHECK: [[IF_END]]: -; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ , %[[ENTRY]] ] -; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 0x7FF8000000000000, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 1.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: [[FA_SROA_9_0:%.*]] = phi float [ [[DIV_2_I_I]], %[[IF_THEN]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x float> [ [[TMP10]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = fmul <4 x float> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP22]], float [[FA_SROA_9_0]], i32 1 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP28]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> , float [[TMP11]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[TMP17]], [[TMP18]] ; CHECK-NEXT: [[CALL25:%.*]] = load volatile ptr, ptr null, align 8 -; 
CHECK-NEXT: [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP21:%.*]] = fmul <2 x float> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x float> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x float> , <4 x float> [[TMP30]], <4 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = fmul <4 x float> , [[TMP31]] +; CHECK-NEXT: [[TMP26:%.*]] = fadd <4 x float> , [[TMP32]] ; CHECK-NEXT: [[TMP23:%.*]] = fmul <4 x float> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> [[TMP24]], <4 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x float> , <4 x float> [[TMP28]], <4 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = fadd <4 x float> [[TMP25]], [[TMP26]] ; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[CALL25]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll index 6942df532ae29..91ec61b275205 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll @@ -25,8 +25,7 @@ define void @foo(double %i) { ; CHECK-NEXT: [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]] ; CHECK-NEXT: [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]] ; CHECK-NEXT: [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x double> , double [[I82]], i32 3 -; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]] +; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], ; CHECK-NEXT: [[TMP25:%.*]] = 
fadd <4 x double> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = fmul <4 x double> [[TMP26]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll index a07e617384e09..fd7f0c61b6737 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll @@ -6,14 +6,17 @@ define i1 @test(double %circ_radius, ptr %x) { ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8 -; CHECK-NEXT: [[ADD20:%.*]] = fadd double [[TMP0]], 0.000000e+00 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[ADD20]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x double> , double [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = fadd <4 x double> , [[TMP13]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> , <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP4]], [[TMP5]] -; 
CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x double> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP8]]) ; CHECK-NEXT: [[TMP10:%.*]] = fcmp olt <4 x double> [[TMP9]], splat (double 1.000000e+00) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll index eb3b183fd49eb..a9baedef3e509 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll @@ -6,17 +6,18 @@ define i1 @test(double %circ_radius, ptr %x, double %0) { ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]], double [[TMP0:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[X]], align 8 -; CHECK-NEXT: [[ADD20:%.*]] = fadd double [[TMP1]], 0.000000e+00 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD20]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1 +; CHECK-NEXT: 
[[TMP16:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP16]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> , <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x double> [[TMP9]], [[TMP17]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x double> [[TMP7]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP12]]) ; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x double> [[TMP13]], splat (double 1.000000e+00) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll index 8c684325f8c68..b71dbc49e7478 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll @@ -8,13 +8,11 @@ define void @test(ptr %quat, float %call13) { ; CHECK-SAME: ptr [[QUAT:%.*]], float [[CALL13:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CALL121:%.*]] = load volatile float, ptr null, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fmuladd.f32(float [[CALL13]], float 0.000000e+00, float 0.000000e+00) -; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float 
[[CALL121]], float 0.000000e+00, float 0.000000e+00) -; CHECK-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 0.000000e+00 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[CALL121]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer) +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> zeroinitializer, <2 x float> [[TMP6]]) ; CHECK-NEXT: store <2 x float> [[TMP7]], ptr [[QUAT]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll index f101991648276..6dc9806da0aa9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll @@ -4,9 +4,7 @@ define float @test() { ; CHECK-LABEL: define float @test() { ; CHECK-NEXT: [[LABEL:.*]]: -; CHECK-NEXT: [[SUB_I102_I:%.*]] = fsub float 0.000000e+00, 0.000000e+00 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[SUB_I102_I]], i32 2 -; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float 0.000000e+00, i32 0 ; 
CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0.000000e+00, i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> @@ -14,26 +12,12 @@ define float @test() { ; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x float> , [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fadd <8 x float> , [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = fadd <8 x float> [[TMP7]], -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> , <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> zeroinitializer, [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> , <8 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = fmul <8 x float> , [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = fadd <8 x float> , [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <12 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <12 x float> [[TMP16]], <12 x float> , <12 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP18]], <2 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x float> , <8 x float> [[TMP19]], <8 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = fsub <8 x float> [[TMP20]], [[TMP8]] -; CHECK-NEXT: [[TMP22:%.*]] = fadd <12 x float> , [[TMP17]] -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <12 x float> [[TMP22]], <12 x float> poison, <20 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = fsub <8 x float> zeroinitializer, [[TMP8]] ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP21]], <8 x float> poison, <20 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <20 x float> [[TMP23]], <20 x 
float> [[TMP24]], <20 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <20 x float> , <20 x float> [[TMP24]], <20 x i32> ; CHECK-NEXT: br label %[[REGION_30:.*]] ; CHECK: [[REGION_30]]: -; CHECK-NEXT: [[TMP26:%.*]] = phi <20 x float> [ [[TMP25]], %[[LABEL]] ] +; CHECK-NEXT: [[TMP26:%.*]] = phi <20 x float> [ [[TMP10]], %[[LABEL]] ] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <20 x float> [[TMP26]], i32 7 ; CHECK-NEXT: ret float [[TMP27]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll index 7b298723d93b5..c58c63e51737c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll @@ -11,30 +11,23 @@ define void @test(ptr %this, ptr %0, double %1) { ; CHECK-NEXT: [[ARRAYIDX_I1464:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8 ; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX_I1464]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[THIS]], align 8 -; CHECK-NEXT: [[DIV251:%.*]] = fmul double [[TMP1]], 0.000000e+00 ; CHECK-NEXT: [[MUL257:%.*]] = fmul double [[TMP4]], 0.000000e+00 ; CHECK-NEXT: [[MUL305:%.*]] = fmul double [[TMP4]], 0.000000e+00 -; CHECK-NEXT: [[TMP5:%.*]] = fneg double [[TMP2]] -; CHECK-NEXT: [[NEG356:%.*]] = fmul double [[TMP1]], [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG356]], double 0.000000e+00, double 0.000000e+00) ; CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[THIS]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = fneg double [[TMP3]] ; CHECK-NEXT: [[NEG380:%.*]] = fmul double [[TMP1]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double [[MUL257]]) ; CHECK-NEXT: [[FNEG381:%.*]] = fneg double [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double 0.000000e+00) 
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[DIV251]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[FNEG381]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[NEG417:%.*]] = fneg double [[MUL257]] -; CHECK-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG417]], double 0.000000e+00, double 0.000000e+00) -; CHECK-NEXT: [[FNEG418:%.*]] = fneg double [[TMP16]] -; CHECK-NEXT: [[MUL419:%.*]] = fmul double [[DIV251]], [[FNEG418]] +; CHECK-NEXT: [[TMP5:%.*]] = fneg double [[TMP2]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, double [[MUL257]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = fneg <2 x double> [[TMP11]] ; CHECK-NEXT: [[NEG436:%.*]] = fmul double [[TMP1]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00) -; CHECK-NEXT: [[FNEG437:%.*]] = fneg double [[TMP17]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP14]], <2 x double> zeroinitializer, <2 x double> zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = fneg <2 x double> [[TMP17]] ; CHECK-NEXT: [[TMP18:%.*]] = fneg double [[TMP4]] ; CHECK-NEXT: [[NEG455:%.*]] = fmul double [[TMP1]], [[TMP18]] ; CHECK-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG455]], double 0.000000e+00, double [[MUL305]]) @@ -42,19 +35,18 @@ define void @test(ptr %this, ptr %0, double %1) { ; CHECK-NEXT: 
[[FNEG474:%.*]] = fneg double [[TMP20]] ; CHECK-NEXT: [[NEG492:%.*]] = fneg double [[MUL305]] ; CHECK-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG492]], double 0.000000e+00, double 0.000000e+00) -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x double> poison, double [[DIV251]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x double> [[TMP22]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x double> poison, double [[FNEG437]], i32 0 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x double> [[TMP24]], double [[TMP19]], i32 1 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x double> [[TMP25]], double [[FNEG474]], i32 2 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> [[TMP26]], double [[TMP21]], i32 3 -; CHECK-NEXT: [[TMP28:%.*]] = fmul <4 x double> [[TMP23]], [[TMP27]] +; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> , [[TMP13]] +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP23]], <2 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00) ; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x double> poison, double [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x double> [[TMP29]], double [[FNEG381]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x double> [[TMP25]], double [[TMP10]], i32 2 ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <8 x double> [[TMP29]], <8 x double> [[TMP30]], <8 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x double> [[TMP31]], double [[MUL419]], i32 3 -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x double> [[TMP28]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> [[TMP33]], <8 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <8 x double> 
[[TMP26]], <8 x double> [[TMP30]], <8 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x double> [[TMP28]], double [[TMP19]], i32 5 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x double> [[TMP32]], double [[FNEG474]], i32 6 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x double> [[TMP33]], double [[TMP21]], i32 7 +; CHECK-NEXT: [[TMP34:%.*]] = fmul <8 x double> [[TMP31]], [[TMP22]] ; CHECK-NEXT: [[TMP35:%.*]] = fptrunc <8 x double> [[TMP34]] to <8 x float> ; CHECK-NEXT: store <8 x float> [[TMP35]], ptr [[TMP7]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll index 2a0e7889f0f34..d10d26671e76b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s -; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s define void @add0(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @add0( @@ -336,32 +336,12 @@ entry: } define void @add1f(ptr noalias %dst, ptr noalias %src) { -; NON-POW2-LABEL: @add1f( -; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 -; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4 -; NON-POW2-NEXT: 
[[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 -; NON-POW2-NEXT: store float [[TMP0]], ptr [[DST]], align 4 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = fadd fast <3 x float> [[TMP1]], -; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 -; NON-POW2-NEXT: ret void -; -; POW2-ONLY-LABEL: @add1f( -; POW2-ONLY-NEXT: entry: -; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 -; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4 -; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 -; POW2-ONLY-NEXT: store float [[TMP0]], ptr [[DST]], align 4 -; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3 -; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], -; POW2-ONLY-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4 -; POW2-ONLY-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 -; POW2-ONLY-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4 -; POW2-ONLY-NEXT: ret void +; CHECK-LABEL: @add1f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: ret void ; entry: %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1 @@ -387,18 +367,9 @@ entry: define void @sub0f(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @sub0f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, 
ptr [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 -; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], -; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -565,18 +536,9 @@ entry: define void @mulf(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @mulf( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2 -; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], -; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 -; CHECK-NEXT: store float [[SUB9]], ptr 
[[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -631,32 +593,12 @@ entry: } define void @add1fn(ptr noalias %dst, ptr noalias %src) { -; NON-POW2-LABEL: @add1fn( -; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 -; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4 -; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 -; NON-POW2-NEXT: store float [[TMP0]], ptr [[DST]], align 4 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = fadd <3 x float> [[TMP1]], -; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 -; NON-POW2-NEXT: ret void -; -; POW2-ONLY-LABEL: @add1fn( -; POW2-ONLY-NEXT: entry: -; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 -; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4 -; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 -; POW2-ONLY-NEXT: store float [[TMP0]], ptr [[DST]], align 4 -; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3 -; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], -; POW2-ONLY-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4 -; POW2-ONLY-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00 -; POW2-ONLY-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4 -; POW2-ONLY-NEXT: ret void +; CHECK-LABEL: @add1fn( +; 
CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: ret void ; entry: %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1 @@ -682,18 +624,9 @@ entry: define void @sub0fn(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @sub0fn( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 -; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], -; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -780,18 +713,9 @@ entry: define void @mulfn(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @mulfn( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2 -; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], -; 
CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 -; CHECK-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll index 125c2dce32663..b23da5fa263f6 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll @@ -52,11 +52,12 @@ define <2 x float> @replace_through_casts_and_binop(i16 %inp) { ; CHECK-SAME: i16 [[INP:%.*]]) { ; CHECK-NEXT: [[ADD:%.*]] = add nsw i16 [[INP]], -10 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i16 [[INP]], 5 -; CHECK-NEXT: [[TMP1:%.*]] = uitofp i16 [[MUL]] to float -; CHECK-NEXT: [[TMP2:%.*]] = fadd float [[TMP1]], 2.000000e+00 -; CHECK-NEXT: [[TMP3:%.*]] = sitofp i16 [[ADD]] to float -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0 -; CHECK-NEXT: [[R:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[MUL]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> [[TMP1]], i16 [[ADD]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x float> +; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x 
float> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[TMP5]], ; CHECK-NEXT: ret <2 x float> [[R]] ; %add = add nsw i16 %inp, -10 diff --git a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll index 793d089404d1e..c79969de6ac41 100644 --- a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll @@ -1,52 +1,98 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %} define void @exceed(double %0, double %1) { -; CHECK-LABEL: @exceed( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x 
double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 -; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef -; CHECK-NEXT: [[IXX0:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX1:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX2:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX3:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX4:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX5:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef -; CHECK-NEXT: [[IXX10:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX11:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX12:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX13:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX14:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX15:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX20:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX21:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IXX22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 -; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]] -; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef -; CHECK-NEXT: switch i32 undef, label [[BB1:%.*]] [ -; CHECK-NEXT: i32 0, label [[BB2:%.*]] -; CHECK-NEXT: ] -; CHECK: bb1: -; CHECK-NEXT: br label 
[[LABEL:%.*]] -; CHECK: bb2: -; CHECK-NEXT: br label [[LABEL]] -; CHECK: label: -; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ] -; CHECK-NEXT: ret void +; X86-LABEL: @exceed( +; X86-NEXT: entry: +; X86-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0 +; X86-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; X86-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 +; X86-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer +; X86-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] +; X86-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; X86-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef +; X86-NEXT: [[IXX0:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX1:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX2:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX3:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX4:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX5:%.*]] = fsub double undef, undef +; X86-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef +; X86-NEXT: [[IXX10:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX11:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX12:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX13:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX14:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX15:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX20:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX21:%.*]] = fsub double undef, undef +; X86-NEXT: [[IXX22:%.*]] = fsub double undef, undef +; X86-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; X86-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]] +; X86-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] +; X86-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> 
+; X86-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]] +; X86-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]] +; X86-NEXT: [[IXX101:%.*]] = fsub double undef, undef +; X86-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> +; X86-NEXT: [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef +; X86-NEXT: switch i32 undef, label [[BB1:%.*]] [ +; X86-NEXT: i32 0, label [[BB2:%.*]] +; X86-NEXT: ] +; X86: bb1: +; X86-NEXT: br label [[LABEL:%.*]] +; X86: bb2: +; X86-NEXT: br label [[LABEL]] +; X86: label: +; X86-NEXT: [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ] +; X86-NEXT: ret void +; +; AARCH64-LABEL: @exceed( +; AARCH64-NEXT: entry: +; AARCH64-NEXT: [[IXX0:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX1:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX2:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX3:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX4:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX5:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX10:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX11:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX12:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX13:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX14:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX15:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX20:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX21:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[IXX22:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0 +; AARCH64-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; AARCH64-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 +; AARCH64-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> 
poison, <2 x i32> zeroinitializer +; AARCH64-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] +; AARCH64-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; AARCH64-NEXT: [[IX2:%.*]] = fmul double [[TMP7]], [[TMP7]] +; AARCH64-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] +; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> +; AARCH64-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> , <2 x i32> +; AARCH64-NEXT: [[TMP11:%.*]] = fdiv fast <2 x double> [[TMP9]], [[TMP10]] +; AARCH64-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; AARCH64-NEXT: [[IX:%.*]] = fmul double [[TMP12]], undef +; AARCH64-NEXT: [[IX1:%.*]] = fmul double [[TMP12]], undef +; AARCH64-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]] +; AARCH64-NEXT: [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], [[TMP8]] +; AARCH64-NEXT: [[IXX101:%.*]] = fsub double undef, undef +; AARCH64-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP11]], undef +; AARCH64-NEXT: switch i32 undef, label [[BB1:%.*]] [ +; AARCH64-NEXT: i32 0, label [[BB2:%.*]] +; AARCH64-NEXT: ] +; AARCH64: bb1: +; AARCH64-NEXT: br label [[LABEL:%.*]] +; AARCH64: bb2: +; AARCH64-NEXT: br label [[LABEL]] +; AARCH64: label: +; AARCH64-NEXT: [[TMP16:%.*]] = phi <2 x double> [ [[TMP14]], [[BB1]] ], [ [[TMP15]], [[BB2]] ] +; AARCH64-NEXT: ret void ; entry: %i10 = fdiv fast double %0, %1 diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll index 32e59697486a7..439943102b58a 100644 --- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll @@ -7,56 +7,52 @@ define i1 @test(float %0, double %1) { ; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { ; X86-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float 
[[TMP0]], i32 3 ; X86-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> -; X86-NEXT: [[TMP5:%.*]] = insertelement <6 x double> , double [[TMP1]], i32 4 -; X86-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> -; X86-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> -; X86-NEXT: [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]] -; X86-NEXT: [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> -; X86-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> , <4 x i32> -; X86-NEXT: [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> +; X86-NEXT: [[TMP5:%.*]] = insertelement <4 x double> , double [[TMP1]], i32 1 +; X86-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], +; X86-NEXT: [[TMP7:%.*]] = insertelement <8 x double> , double [[TMP1]], i32 4 +; X86-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> +; X86-NEXT: [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> +; X86-NEXT: [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]] +; X86-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> ; X86-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> , <4 x i32> -; X86-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]] +; X86-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP6]], [[TMP12]] ; X86-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> ; X86-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP14]], <8 x i32> -; X86-NEXT: [[TMP16:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> -; X86-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP16]], <8 x i32> -; X86-NEXT: [[TMP18:%.*]] = fsub <8 x double> [[TMP15]], [[TMP17]] -; X86-NEXT: [[TMP19:%.*]] = 
fmul <8 x double> [[TMP15]], [[TMP17]] -; X86-NEXT: [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> -; X86-NEXT: [[TMP21:%.*]] = fptrunc <8 x double> [[TMP20]] to <8 x float> -; X86-NEXT: [[TMP22:%.*]] = fmul <8 x float> [[TMP21]], zeroinitializer -; X86-NEXT: [[TMP23:%.*]] = fcmp oeq <8 x float> [[TMP22]], zeroinitializer -; X86-NEXT: [[TMP24:%.*]] = freeze <8 x i1> [[TMP23]] -; X86-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP24]]) -; X86-NEXT: ret i1 [[TMP25]] +; X86-NEXT: [[TMP16:%.*]] = fsub <8 x double> [[TMP15]], [[TMP10]] +; X86-NEXT: [[TMP17:%.*]] = fmul <8 x double> [[TMP15]], [[TMP10]] +; X86-NEXT: [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> +; X86-NEXT: [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float> +; X86-NEXT: [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer +; X86-NEXT: [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer +; X86-NEXT: [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]] +; X86-NEXT: [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]]) +; X86-NEXT: ret i1 [[TMP23]] ; ; AARCH64-LABEL: define i1 @test ; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { ; AARCH64-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 ; AARCH64-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> -; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <6 x double> , double [[TMP1]], i32 4 -; AARCH64-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> -; AARCH64-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> -; AARCH64-NEXT: [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]] -; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> -; AARCH64-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> , <4 x i32> -; AARCH64-NEXT: 
[[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> +; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <4 x double> , double [[TMP1]], i32 1 +; AARCH64-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], +; AARCH64-NEXT: [[TMP7:%.*]] = insertelement <8 x double> , double [[TMP1]], i32 4 +; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> +; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> +; AARCH64-NEXT: [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]] +; AARCH64-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> ; AARCH64-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> , <4 x i32> ; AARCH64-NEXT: [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP4]], <4 x i32> -; AARCH64-NEXT: [[TMP14:%.*]] = fmul <4 x double> [[TMP10]], [[TMP13]] +; AARCH64-NEXT: [[TMP14:%.*]] = fmul <4 x double> [[TMP6]], [[TMP13]] ; AARCH64-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <8 x i32> ; AARCH64-NEXT: [[TMP16:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP15]], <8 x i32> -; AARCH64-NEXT: [[TMP17:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> -; AARCH64-NEXT: [[TMP18:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP17]], <8 x i32> -; AARCH64-NEXT: [[TMP19:%.*]] = fsub <8 x double> [[TMP16]], [[TMP18]] -; AARCH64-NEXT: [[TMP20:%.*]] = fmul <8 x double> [[TMP16]], [[TMP18]] -; AARCH64-NEXT: [[TMP21:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x i32> -; AARCH64-NEXT: [[TMP22:%.*]] = fptrunc <8 x double> [[TMP21]] to <8 x float> -; AARCH64-NEXT: [[TMP23:%.*]] = fmul <8 x float> [[TMP22]], zeroinitializer -; AARCH64-NEXT: [[TMP24:%.*]] = fcmp oeq <8 x float> [[TMP23]], zeroinitializer -; AARCH64-NEXT: [[TMP25:%.*]] = freeze <8 x i1> [[TMP24]] -; AARCH64-NEXT: 
[[TMP26:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP25]]) -; AARCH64-NEXT: ret i1 [[TMP26]] +; AARCH64-NEXT: [[TMP17:%.*]] = fsub <8 x double> [[TMP16]], [[TMP10]] +; AARCH64-NEXT: [[TMP18:%.*]] = fmul <8 x double> [[TMP16]], [[TMP10]] +; AARCH64-NEXT: [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> +; AARCH64-NEXT: [[TMP20:%.*]] = fptrunc <8 x double> [[TMP19]] to <8 x float> +; AARCH64-NEXT: [[TMP21:%.*]] = fmul <8 x float> [[TMP20]], zeroinitializer +; AARCH64-NEXT: [[TMP22:%.*]] = fcmp oeq <8 x float> [[TMP21]], zeroinitializer +; AARCH64-NEXT: [[TMP23:%.*]] = freeze <8 x i1> [[TMP22]] +; AARCH64-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP23]]) +; AARCH64-NEXT: ret i1 [[TMP24]] ; %3 = fpext float %0 to double %4 = fpext float 0.000000e+00 to double diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll index eefc99feebb95..09e3ef41b3dbe 100644 --- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll +++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll @@ -6,34 +6,34 @@ define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778) ; X86-LABEL: @test( ; X86-NEXT: entry: ; X86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 -; X86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 -; X86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 -; X86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 -; X86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]] -; X86-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0 +; X86-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8 +; X86-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; X86-NEXT: [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double 
[[I1754:%.*]], i32 0 ; X86-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1 -; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2 -; X86-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3 -; X86-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer -; X86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]] -; X86-NEXT: [[TMP6:%.*]] = insertelement <4 x double> , double [[I1797]], i32 3 +; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2 +; X86-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer +; X86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]] +; X86-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; X86-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> +; X86-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> , <4 x i32> +; X86-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] ; X86-NEXT: [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]] ; X86-NEXT: ret <4 x double> [[TMP7]] ; ; AARCH86-LABEL: @test( ; AARCH86-NEXT: entry: ; AARCH86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 -; AARCH86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 -; AARCH86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 -; AARCH86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 -; AARCH86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]] -; AARCH86-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0 +; AARCH86-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8 +; AARCH86-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; AARCH86-NEXT: 
[[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0 ; AARCH86-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1 -; AARCH86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2 -; AARCH86-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3 -; AARCH86-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer -; AARCH86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]] -; AARCH86-NEXT: [[TMP6:%.*]] = insertelement <4 x double> , double [[I1797]], i32 3 +; AARCH86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2 +; AARCH86-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer +; AARCH86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]] +; AARCH86-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; AARCH86-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> +; AARCH86-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> , <4 x i32> +; AARCH86-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] ; AARCH86-NEXT: [[I1994:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]] ; AARCH86-NEXT: ret <4 x double> [[I1994]] ;