diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 0c49fc86a4232..44a136626b6e3 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -73,6 +73,7 @@ Changes to Vectorizers
 
 * Added initial support for copyable elements in SLP, which models copyable
   elements as add , 0, i.e. uses identity constants for missing lanes.
+* The SLP vectorizer now supports initial recognition of FMA/FMAD patterns.
 
 Changes to the AArch64 Backend
 ------------------------------
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 62ab3f522bb6f..04f8bf4984336 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3883,6 +3883,7 @@ class BoUpSLP {
   enum CombinedOpcode {
     NotCombinedOp = -1,
     MinMax = Instruction::OtherOpsEnd + 1,
+    FMulAdd,
   };
   CombinedOpcode CombinedOp = NotCombinedOp;
@@ -4033,6 +4034,9 @@ class BoUpSLP {
     /// Returns true if any scalar in the list is a copyable element.
     bool hasCopyableElements() const { return !CopyableElements.empty(); }
 
+    /// Returns the state of the operations.
+    const InstructionsState &getOperations() const { return S; }
+
     /// When ReuseReorderShuffleIndices is empty it just returns position of \p
     /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
     unsigned findLaneForValue(Value *V) const {
@@ -11987,6 +11991,81 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
   }
 }
 
+static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
+                                       const InstructionsState &S,
+                                       DominatorTree &DT, const DataLayout &DL,
+                                       TargetTransformInfo &TTI,
+                                       const TargetLibraryInfo &TLI) {
+  assert(all_of(VL,
+                [](Value *V) {
+                  return V->getType()->getScalarType()->isFloatingPointTy();
+                }) &&
+         "Can only convert to FMA for floating point types");
+  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
+
+  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
+    FastMathFlags FMF;
+    FMF.set();
+    for (Value *V : VL) {
+      auto *I = dyn_cast<Instruction>(V);
+      if (!I)
+        continue;
+      // TODO: support for copyable elements.
+      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
+      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
+        continue;
+      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
+        FMF &= FPCI->getFastMathFlags();
+    }
+    return FMF.allowContract();
+  };
+  if (!CheckForContractable(VL))
+    return InstructionCost::getInvalid();
+  // The fmul operands should also be contractable.
+  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
+  SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
+
+  InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
+  if (!OpS.valid())
+    return InstructionCost::getInvalid();
+  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
+    return InstructionCost::getInvalid();
+  if (!CheckForContractable(Operands.front()))
+    return InstructionCost::getInvalid();
+  // Compare the costs.
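+  // FMulPlusFAddCost totals the scalar fadd/fsub instructions plus the
+  // single-use fmul operands that could be fused; FMACost totals the lanes
+  // that cannot be fused (which keep their fadd and fmul) plus one
+  // llvm.fmuladd call per fusible lane (NumOps). The conversion is reported
+  // only when it is strictly cheaper.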
+  InstructionCost FMulPlusFAddCost = 0;
+  InstructionCost FMACost = 0;
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  FastMathFlags FMF;
+  FMF.set();
+  for (Value *V : VL) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      continue;
+    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
+      FMF &= FPCI->getFastMathFlags();
+    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
+  }
+  unsigned NumOps = 0;
+  for (auto [V, Op] : zip(VL, Operands.front())) {
+    auto *I = dyn_cast<Instruction>(Op);
+    if (!I || !I->hasOneUse()) {
+      FMACost += TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+      if (I)
+        FMACost += TTI.getInstructionCost(I, CostKind);
+      continue;
+    }
+    ++NumOps;
+    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
+      FMF &= FPCI->getFastMathFlags();
+    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
+  }
+  Type *Ty = VL.front()->getType();
+  IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
+  FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
+  return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
+}
+
 void BoUpSLP::transformNodes() {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   BaseGraphSize = VectorizableTree.size();
@@ -12355,6 +12434,25 @@ void BoUpSLP::transformNodes() {
       }
       break;
     }
+    case Instruction::FSub:
+    case Instruction::FAdd: {
+      // Check if possible to convert (a*b)+c to fma.
+      if (E.State != TreeEntry::Vectorize ||
+          !E.getOperations().isAddSubLikeOp())
+        break;
+      if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
+               .isValid())
+        break;
+      // This node is a fmuladd node.
+      E.CombinedOp = TreeEntry::FMulAdd;
+      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
+      if (FMulEntry->UserTreeIndex &&
+          FMulEntry->State == TreeEntry::Vectorize) {
+        // The FMul node is part of the combined fmuladd node.
+        FMulEntry->State = TreeEntry::CombinedVectorize;
+      }
+      break;
+    }
     default:
       break;
     }
@@ -13587,6 +13685,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     }
     return IntrinsicCost;
   };
+  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
+                                         Instruction *VI) {
+    InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
+    return Cost;
+  };
   switch (ShuffleOrOp) {
   case Instruction::PHI: {
     // Count reused scalars.
@@ -13927,6 +14030,30 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);
   }
+  case TreeEntry::FMulAdd: {
+    auto GetScalarCost = [&](unsigned Idx) {
+      if (isa<PoisonValue>(UniqueValues[Idx]))
+        return InstructionCost(TTI::TCC_Free);
+      return GetFMulAddCost(E->getOperations(),
+                            cast<Instruction>(UniqueValues[Idx]));
+    };
+    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
+      FastMathFlags FMF;
+      FMF.set();
+      for (Value *V : E->Scalars) {
+        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
+          FMF &= FPCI->getFastMathFlags();
+          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
+            FMF &= FPCIOp->getFastMathFlags();
+        }
+      }
+      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
+                                  {VecTy, VecTy, VecTy}, FMF);
+      InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+      return VecCost + CommonCost;
+    };
+    return GetCostDiff(GetScalarCost, GetVectorCost);
+  }
   case Instruction::FNeg:
   case Instruction::Add:
   case Instruction::FAdd:
@@ -13964,8 +14091,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       }
       TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
       TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
-      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
-                                         Op1Info, Op2Info, Operands);
+      InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
+          ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
+      if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
+          I && (ShuffleOrOp == Instruction::FAdd ||
+                ShuffleOrOp == Instruction::FSub)) {
+        InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
+        if (IntrinsicCost.isValid())
+          ScalarCost = IntrinsicCost;
+      }
+      return ScalarCost;
     };
     auto GetVectorCost = [=](InstructionCost CommonCost) {
       if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
@@ -22593,11 +22728,21 @@ class HorizontalReduction {
   /// Try to find a reduction tree.
   bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                  ScalarEvolution &SE, const DataLayout &DL,
-                                 const TargetLibraryInfo &TLI) {
+                                 const TargetLibraryInfo &TLI,
+                                 DominatorTree &DT, TargetTransformInfo &TTI) {
     RdxKind = HorizontalReduction::getRdxKind(Root);
     if (!isVectorizable(RdxKind, Root))
       return false;
 
+    // FMA reduction root - skip.
+    auto CheckForFMA = [&](Instruction *I) {
+      return RdxKind == RecurKind::FAdd &&
+             canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI)
+                 .isValid();
+    };
+    if (CheckForFMA(Root))
+      return false;
+
     // Analyze "regular" integer/FP types for reductions - no target-specific
     // types or pointers.
     Type *Ty = Root->getType();
@@ -22635,7 +22780,7 @@ class HorizontalReduction {
         // Also, do not try to reduce const values, if the operation is not
        // foldable.
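+        // An edge that is itself a profitable scalar FMA candidate (see
+        // CheckForFMA above) is treated as a possible reduced value, keeping
+        // its fmul/fadd pair intact for later fusion.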
         if (!EdgeInst || Level > RecursionMaxDepth ||
-            getRdxKind(EdgeInst) != RdxKind ||
+            getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) ||
             IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
             !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
             !isVectorizable(RdxKind, EdgeInst) ||
@@ -24204,13 +24349,13 @@ bool SLPVectorizerPass::vectorizeHorReduction(
   Stack.emplace(SelectRoot(), 0);
   SmallPtrSet<Value *, 8> VisitedInstrs;
   bool Res = false;
-  auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
+  auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
     if (R.isAnalyzedReductionRoot(Inst))
       return nullptr;
     if (!isReductionCandidate(Inst))
       return nullptr;
     HorizontalReduction HorRdx;
-    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
+    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI))
       return nullptr;
     return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
   };
@@ -24276,6 +24421,12 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
   if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
     return false;
 
+  // Skip potential FMA candidates.
+  if ((I->getOpcode() == Instruction::FAdd ||
+       I->getOpcode() == Instruction::FSub) &&
+      canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
+          .isValid())
+    return false;
 
   Value *P = I->getParent();
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll index 442769937ac12..9e086dcad686c 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll @@ -8,15 +8,18 @@ target triple = "aarch64--linux-gnu" define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] -; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]]) +; CHECK-NEXT: [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float +; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]] +; CHECK-NEXT: [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]] +; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL11]], [[MUL12]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27: @@ -47,15 +50,18 @@ for.end27: define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32>
[[TMP0]], i32 [[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] -; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]]) +; CHECK-NEXT: [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float +; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]] +; CHECK-NEXT: [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]] +; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[MUL11]] ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] ; CHECK: for.end27: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll index 295a71899c338..2e684320ba10e 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll @@ -12,7 +12,8 @@ define void @test() { ; CHECK: [[BB63]]: ; CHECK-NEXT: br label %[[BB64]] ; CHECK: [[BB64]]: -; CHECK-NEXT: [[TMP25:%.*]] = phi <16 x float> [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] +; CHECK-NEXT: [[I65:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] +; CHECK-NEXT: [[I77:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] ; CHECK-NEXT: [[I66:%.*]] = load float, ptr poison, align 16 ; CHECK-NEXT: [[I67:%.*]] = load float, ptr poison, align 4 ; CHECK-NEXT: [[I68:%.*]] = load float, ptr poison, align 8 @@ -24,57 +25,125 @@ define void @test() { ; CHECK-NEXT: [[I74:%.*]] = load float, ptr poison, align 4 ; CHECK-NEXT: [[I75:%.*]] = load float, ptr poison, align 16 ; CHECK-NEXT: [[I76:%.*]] = load float, ptr poison, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x float> poison, float [[I76]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x float> [[TMP1]], float [[I75]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x float> [[TMP2]], float [[I74]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x float> [[TMP3]], float [[I73]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x float> [[TMP4]], float [[I71]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x float> [[TMP5]], float [[I70]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x float> [[TMP6]], float [[I68]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x float> [[TMP7]], float [[I66]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[I72]], i32 13 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[I67]], i32 14 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], 
float [[I69]], i32 15 ; CHECK-NEXT: br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]] ; CHECK: [[BB77]]: -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP17]], <8 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP14]], float [[I68]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP19]], float [[I66]], i32 3 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP16]], float [[I67]], i32 6 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I69]], i32 7 -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP25]], <16 x i32> ; CHECK-NEXT: br label %[[BB78:.*]] ; CHECK: [[BB78]]: -; CHECK-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP23]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP22:%.*]] = phi <8 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP13]] -; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP38]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]] -; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], poison -; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison -; CHECK-NEXT: [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP31]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> +; CHECK-NEXT: [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[I103:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[I104:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[I105:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I106:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I123:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[I124:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I125:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[I126:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I87:%.*]] = fmul fast float [[I85]], poison +; CHECK-NEXT: [[I88:%.*]] = fmul fast float [[I80]], poison +; CHECK-NEXT: [[I89:%.*]] = fmul fast float [[I81]], poison +; CHECK-NEXT: [[I90:%.*]] = fmul fast float [[I82]], poison +; CHECK-NEXT: [[I91:%.*]] = fmul fast float [[I84]], poison +; CHECK-NEXT: [[I92:%.*]] = fadd fast float [[I91]], [[I87]] +; CHECK-NEXT: [[I93:%.*]] = fmul fast float [[I127]], poison +; CHECK-NEXT: [[I94:%.*]] = fadd fast float [[I93]], [[I88]] +; CHECK-NEXT: [[I95:%.*]] = fmul fast float [[I131]], poison +; CHECK-NEXT: [[I96:%.*]] = fadd fast float [[I95]], [[I89]] +; CHECK-NEXT: [[I97:%.*]] = fmul fast float [[I86]], poison +; CHECK-NEXT: [[I98:%.*]] = fadd fast float [[I97]], 
[[I90]] +; CHECK-NEXT: [[I99:%.*]] = fadd fast float [[I92]], poison +; CHECK-NEXT: [[I100:%.*]] = fadd fast float [[I94]], poison +; CHECK-NEXT: [[I101:%.*]] = fadd fast float [[I96]], poison +; CHECK-NEXT: [[I102:%.*]] = fadd fast float [[I98]], poison +; CHECK-NEXT: [[I103]] = fadd fast float [[I99]], poison +; CHECK-NEXT: [[I104]] = fadd fast float [[I100]], poison +; CHECK-NEXT: [[I105]] = fadd fast float [[I101]], poison +; CHECK-NEXT: [[I106]] = fadd fast float [[I102]], poison +; CHECK-NEXT: [[I107:%.*]] = fmul fast float [[I85]], poison +; CHECK-NEXT: [[I108:%.*]] = fmul fast float [[I80]], poison +; CHECK-NEXT: [[I109:%.*]] = fmul fast float [[I81]], poison +; CHECK-NEXT: [[I110:%.*]] = fmul fast float [[I82]], poison +; CHECK-NEXT: [[I111:%.*]] = fmul fast float [[I84]], poison +; CHECK-NEXT: [[I112:%.*]] = fadd fast float [[I111]], [[I107]] +; CHECK-NEXT: [[I113:%.*]] = fmul fast float [[I127]], poison +; CHECK-NEXT: [[I114:%.*]] = fadd fast float [[I113]], [[I108]] +; CHECK-NEXT: [[I115:%.*]] = fmul fast float [[I131]], poison +; CHECK-NEXT: [[I116:%.*]] = fadd fast float [[I115]], [[I109]] +; CHECK-NEXT: [[I117:%.*]] = fmul fast float [[I86]], poison +; CHECK-NEXT: [[I118:%.*]] = fadd fast float [[I117]], [[I110]] +; CHECK-NEXT: [[I119:%.*]] = fadd fast float [[I112]], poison +; CHECK-NEXT: [[I120:%.*]] = fadd fast float [[I114]], poison +; CHECK-NEXT: [[I121:%.*]] = fadd fast float [[I116]], poison +; CHECK-NEXT: [[I122:%.*]] = fadd fast float [[I118]], poison +; CHECK-NEXT: [[I123]] = fadd fast float [[I119]], poison +; CHECK-NEXT: [[I124]] = fadd fast float [[I120]], poison +; CHECK-NEXT: [[I125]] = fadd fast float [[I121]], poison +; CHECK-NEXT: [[I126]] = fadd fast float [[I122]], poison +; CHECK-NEXT: [[I135:%.*]] = fmul fast float [[I85]], [[I65]] +; CHECK-NEXT: [[I128:%.*]] = fmul fast float [[I80]], [[I65]] +; CHECK-NEXT: [[I129:%.*]] = fmul fast float [[I81]], [[I65]] +; CHECK-NEXT: [[I130:%.*]] = fmul fast float [[I82]], [[I65]] +; CHECK-NEXT: [[I133:%.*]] = fmul fast float [[I84]], [[I77]] +; CHECK-NEXT: [[I134:%.*]] = fadd fast float [[I133]], [[I135]] +; CHECK-NEXT: [[I136:%.*]] = fmul fast float [[I127]], [[I77]] +; CHECK-NEXT: [[TMP51:%.*]] = fadd fast float [[I136]], [[I128]] +; CHECK-NEXT: [[I138:%.*]] = fmul fast float [[I131]], [[I77]] +; CHECK-NEXT: [[TMP52:%.*]] = fadd fast float [[I138]], [[I129]] +; CHECK-NEXT: [[I137:%.*]] = fmul fast float [[I86]], [[I77]] +; CHECK-NEXT: [[I139:%.*]] = fadd fast float [[I137]], [[I130]] +; CHECK-NEXT: [[I140:%.*]] = fadd fast float [[I134]], poison +; CHECK-NEXT: [[I141:%.*]] = fadd fast float [[TMP51]], poison +; CHECK-NEXT: [[I142:%.*]] = fadd fast float [[TMP52]], poison +; CHECK-NEXT: [[I143:%.*]] = fadd fast float [[I139]], poison +; CHECK-NEXT: [[I144:%.*]] = fadd fast float [[I140]], poison +; CHECK-NEXT: [[I145:%.*]] = fadd fast float [[I141]], poison +; CHECK-NEXT: [[I146:%.*]] = fadd fast float [[I142]], poison +; CHECK-NEXT: [[I152:%.*]] = fadd fast float [[I143]], poison +; CHECK-NEXT: [[I147:%.*]] = fmul fast float [[I85]], poison +; CHECK-NEXT: [[I148:%.*]] = fmul fast float [[I80]], poison +; CHECK-NEXT: [[I149:%.*]] = fmul fast float [[I81]], poison +; CHECK-NEXT: [[I150:%.*]] = fmul fast float [[I82]], poison +; CHECK-NEXT: [[I151:%.*]] = fmul fast float [[I84]], poison +; CHECK-NEXT: [[TMP57:%.*]] = fadd fast float [[I151]], [[I147]] +; CHECK-NEXT: [[I153:%.*]] = fmul fast float [[I127]], poison +; CHECK-NEXT: [[TMP58:%.*]] = fadd fast float [[I153]], [[I148]] +; CHECK-NEXT: [[I155:%.*]] = fmul fast 
float [[I131]], poison +; CHECK-NEXT: [[TMP59:%.*]] = fadd fast float [[I155]], [[I149]] +; CHECK-NEXT: [[I157:%.*]] = fmul fast float [[I86]], poison +; CHECK-NEXT: [[TMP60:%.*]] = fadd fast float [[I157]], [[I150]] +; CHECK-NEXT: [[I159:%.*]] = fadd fast float [[TMP57]], poison +; CHECK-NEXT: [[I160:%.*]] = fadd fast float [[TMP58]], poison +; CHECK-NEXT: [[I161:%.*]] = fadd fast float [[TMP59]], poison +; CHECK-NEXT: [[I162:%.*]] = fadd fast float [[TMP60]], poison +; CHECK-NEXT: [[I163:%.*]] = fadd fast float [[I159]], poison +; CHECK-NEXT: [[I164:%.*]] = fadd fast float [[I160]], poison +; CHECK-NEXT: [[I165:%.*]] = fadd fast float [[I161]], poison +; CHECK-NEXT: [[I166:%.*]] = fadd fast float [[I162]], poison ; CHECK-NEXT: br i1 poison, label %[[BB78]], label %[[BB167]] ; CHECK: [[BB167]]: -; CHECK-NEXT: [[TMP32:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP29]], %[[BB78]] ] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x float> [[TMP32]], i32 14 +; CHECK-NEXT: [[I168:%.*]] = phi nsz float [ [[I76]], %[[BB64]] ], [ [[I166]], %[[BB78]] ] +; CHECK-NEXT: [[I169:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I165]], %[[BB78]] ] +; CHECK-NEXT: [[I170:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I164]], %[[BB78]] ] +; CHECK-NEXT: [[I171:%.*]] = phi nsz float [ [[I75]], %[[BB64]] ], [ [[I163]], %[[BB78]] ] +; CHECK-NEXT: [[I172:%.*]] = phi nsz float [ [[I74]], %[[BB64]] ], [ [[I152]], %[[BB78]] ] +; CHECK-NEXT: [[I173:%.*]] = phi nsz float [ [[I73]], %[[BB64]] ], [ [[I146]], %[[BB78]] ] +; CHECK-NEXT: [[TMP34:%.*]] = phi nsz float [ [[I72]], %[[BB64]] ], [ [[I145]], %[[BB78]] ] +; CHECK-NEXT: [[I175:%.*]] = phi nsz float [ [[I71]], %[[BB64]] ], [ [[I144]], %[[BB78]] ] +; CHECK-NEXT: [[I176:%.*]] = phi nsz float [ [[I70]], %[[BB64]] ], [ [[I126]], %[[BB78]] ] +; CHECK-NEXT: [[I177:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I125]], %[[BB78]] ] +; CHECK-NEXT: [[I178:%.*]] = phi nsz float [ [[I69]], %[[BB64]] ], [ [[I124]], %[[BB78]] ] +; CHECK-NEXT: [[I179:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I123]], %[[BB78]] ] +; CHECK-NEXT: [[I180:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I106]], %[[BB78]] ] +; CHECK-NEXT: [[I181:%.*]] = phi nsz float [ [[I68]], %[[BB64]] ], [ [[I105]], %[[BB78]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi nsz float [ [[I67]], %[[BB64]] ], [ [[I104]], %[[BB78]] ] +; CHECK-NEXT: [[I183:%.*]] = phi nsz float [ [[I66]], %[[BB64]] ], [ [[I103]], %[[BB78]] ] ; CHECK-NEXT: store float [[TMP33]], ptr poison, align 1 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x float> [[TMP32]], i32 13 ; CHECK-NEXT: store float [[TMP34]], ptr poison, align 1 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x float> [[TMP32]], i32 15 ; CHECK-NEXT: br i1 poison, label %[[BB186:.*]], label %[[BB184:.*]] ; CHECK: [[BB184]]: ; CHECK-NEXT: br label %[[BB185:.*]] ; CHECK: [[BB185]]: ; CHECK-NEXT: br i1 poison, label %[[BB185]], label %[[BB186]] ; CHECK: [[BB186]]: -; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[TMP35]], %[[BB167]] ], [ poison, %[[BB185]] ] +; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[I178]], %[[BB167]] ], [ poison, %[[BB185]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index 64bdcf28af550..8093285ad8717 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -8,35 +8,56 @@ define fastcc i64 
@zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: +; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[ARG3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[TMP6]], +; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00 +; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00 +; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00 +; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00 +; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x float> [ [[TMP7]], [[BB:%.*]] ] -; CHECK-NEXT: [[VAL16:%.*]] = extractelement <4 x float> [[TMP7]], i32 2 +; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ] +; CHECK-NEXT: [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ] +; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ] +; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ] ; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00 -; CHECK-NEXT: [[VAL17:%.*]] = extractelement <4 x float> [[TMP7]], i32 3 ; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x float> [ [[TMP8]], [[BB18]] ] +; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ] +; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ] +; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ] +; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ] ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] ; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float> -; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]] -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[VAL54:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP15]]) +; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float +; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr 
[[ARG5]], i64 1 +; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1 +; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float +; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2 +; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1 +; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float +; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3 +; CHECK-NEXT: [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1 +; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float +; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]] +; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]] +; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]] +; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]] +; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]] +; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]] +; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]] +; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]] +; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]] +; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]] +; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]] ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) ; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index 27de36e601512..430a46beace9a 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -600,29 +600,25 @@ define i32 @dot_product_i32_reorder(ptr %a, ptr %b) { } define float @dot_product_fp32(ptr %a, ptr %b) { -; NON-POW2-LABEL: @dot_product_fp32( -; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]] -; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]]) -; NON-POW2-NEXT: ret float [[TMP4]] -; -; POW2-ONLY-LABEL: @dot_product_fp32( -; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 -; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]]) -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = 
fadd fast float [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret float [[ADD_1]] +; CHECK-LABEL: @dot_product_fp32( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1 +; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1 +; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]] +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] ; %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 %l.a.0 = load float, ptr %gep.a.0, align 4 @@ -650,29 +646,25 @@ define float @dot_product_fp32(ptr %a, ptr %b) { ; Same as above, except the reduction order has been perturbed. This ; is checking for our ability to reorder. define float @dot_product_fp32_reorder(ptr %a, ptr %b) { -; NON-POW2-LABEL: @dot_product_fp32_reorder( -; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]] -; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]]) -; NON-POW2-NEXT: ret float [[TMP4]] -; -; POW2-ONLY-LABEL: @dot_product_fp32_reorder( -; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 -; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]] -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]]) -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret float [[ADD_1]] +; CHECK-LABEL: @dot_product_fp32_reorder( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: 
[[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1 +; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1 +; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]] +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_1]], [[MUL_0]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] ; %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 %l.a.0 = load float, ptr %gep.a.0, align 4 @@ -699,29 +691,25 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) { define double @dot_product_fp64(ptr %a, ptr %b) { -; NON-POW2-LABEL: @dot_product_fp64( -; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]] -; NON-POW2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]]) -; NON-POW2-NEXT: ret double [[TMP4]] -; -; POW2-ONLY-LABEL: @dot_product_fp64( -; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2 -; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4 -; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0 -; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2 -; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[GEP_A_0]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]] -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]] -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]]) -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret double [[ADD_1]] +; CHECK-LABEL: @dot_product_fp64( +; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1 +; CHECK-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2 +; CHECK-NEXT: [[L_A_2:%.*]] = load double, 
ptr [[GEP_A_2]], align 4 +; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0 +; CHECK-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4 +; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1 +; CHECK-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2 +; CHECK-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]] +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]] +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret double [[ADD_1]] ; %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0 %l.a.0 = load double, ptr %gep.a.0, align 4 @@ -778,21 +766,13 @@ entry: } define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) { -; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec( -; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0 -; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1 -; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2 -; NON-POW2-NEXT: [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01) -; NON-POW2-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]]) -; NON-POW2-NEXT: ret float [[TMP5]] -; -; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec( -; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01 -; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01 -; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01 -; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] -; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; POW2-ONLY-NEXT: ret float [[ADD_1]] +; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec( +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] ; %mul.0 = fmul fast float %a, 10.0 %mul.1 = fmul fast float %b, 10.0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll index 4a8af6d03da06..0879ec239e287 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll @@ -2,7 +2,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2 ; ; dot4(ptr x, 
ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2])+(x[3]*y[3])) @@ -95,12 +95,47 @@ define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt } define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) { -; CHECK-LABEL: @dot4f64_fast( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) -; CHECK-NEXT: ret double [[TMP4]] +; SSE2-LABEL: @dot4f64_fast( +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; SSE2-NEXT: ret double [[TMP4]] +; +; SSE4-LABEL: @dot4f64_fast( +; SSE4-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; SSE4-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; SSE4-NEXT: ret double [[TMP4]] +; +; AVX-LABEL: @dot4f64_fast( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; AVX-NEXT: ret double [[TMP4]] +; +; AVX2-LABEL: @dot4f64_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX]], i64 2 +; AVX2-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY]], i64 2 +; AVX2-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = load double, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load double, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul double [[X1]], [[Y1]] +; AVX2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4 +; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[MUL1]] +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; AVX2-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP4]] +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; AVX2-NEXT: [[DOT0123:%.*]] = fadd fast double [[DOT012]], [[TMP5]] +; AVX2-NEXT: ret double [[DOT0123]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 @@ -127,12 +162,47 @@ define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { -; CHECK-LABEL: @dot4f32_fast( -; CHECK-NEXT: [[TMP1:%.*]] =
load <4 x float>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; CHECK-NEXT: ret float [[TMP4]] +; SSE2-LABEL: @dot4f32_fast( +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; SSE2-NEXT: ret float [[TMP4]] +; +; SSE4-LABEL: @dot4f32_fast( +; SSE4-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; SSE4-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; SSE4-NEXT: ret float [[TMP4]] +; +; AVX-LABEL: @dot4f32_fast( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; AVX-NEXT: ret float [[TMP4]] +; +; AVX2-LABEL: @dot4f32_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX]], i64 2 +; AVX2-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY]], i64 2 +; AVX2-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = load float, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load float, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul float [[X1]], [[Y1]] +; AVX2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4 +; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[MUL1]] +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; AVX2-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP4]] +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; AVX2-NEXT: [[DOT0123:%.*]] = fadd fast float [[DOT012]], [[TMP5]] +; AVX2-NEXT: ret float [[DOT0123]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1 @@ -371,6 +441,18 @@ define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(1 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 ; AVX-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]] ; AVX-NEXT: ret double [[DOT01]] +; +; AVX2-LABEL: @dot2f64_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = 
load double, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load double, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul double [[X1]], [[Y1]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[MUL1]] +; AVX2-NEXT: ret double [[DOT01]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 @@ -409,6 +491,18 @@ define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 ; AVX-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; AVX-NEXT: ret float [[DOT01]] +; +; AVX2-LABEL: @dot2f32_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[X1:%.*]] = load float, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[Y1:%.*]] = load float, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] +; AVX2-NEXT: [[MUL1:%.*]] = fmul float [[X1]], [[Y1]] +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[MUL1]] +; AVX2-NEXT: ret float [[DOT01]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index eaa77d74f8df1..0bbdeb55e1516 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -31,12 +31,9 @@ define float @baz() { ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], 2.000000e+00 ; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0 -; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 1 -; THRESHOLD-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], splat (float 2.000000e+00) -; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 +; THRESHOLD-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP4]], 2.000000e+00 ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] ; THRESHOLD-NEXT: store float [[OP_RDX]], ptr @res, align 4 ; THRESHOLD-NEXT: ret float [[OP_RDX]] @@ -76,14 +73,41 @@ define float @bazz() { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr 
@arr1, i64 0, i64 1), align 4 +; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]] ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]] +; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16 +; CHECK-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4 +; CHECK-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8 +; CHECK-NEXT: [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]] +; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4 +; CHECK-NEXT: [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]] ; CHECK-NEXT: store float [[OP_RDX1]], ptr @res, align 4 ; CHECK-NEXT: ret float [[OP_RDX1]] ; @@ -92,14 +116,41 @@ define float @bazz() { ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[MUL4:%.*]] = fmul fast 
float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] +; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]] +; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]] +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]] +; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]] ; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]] +; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16 +; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16 +; THRESHOLD-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4 +; THRESHOLD-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4 +; THRESHOLD-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]] +; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]] +; THRESHOLD-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8 +; THRESHOLD-NEXT: [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8 +; THRESHOLD-NEXT: [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]] +; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]] +; THRESHOLD-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4 +; THRESHOLD-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4 +; THRESHOLD-NEXT: [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]] ; THRESHOLD-NEXT: store float [[OP_RDX1]], ptr 
@res, align 4 ; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; @@ -151,10 +202,21 @@ define float @bazzz() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: store float [[TMP5]], ptr @res, align 4 ; CHECK-NEXT: ret float [[TMP5]] @@ -163,10 +225,21 @@ define float @bazzz() { ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, 
ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: store float [[TMP5]], ptr @res, align 4 ; THRESHOLD-NEXT: ret float [[TMP5]] @@ -199,10 +272,21 @@ define i32 @foo() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; CHECK-NEXT: store i32 [[CONV4]], ptr @n, align 4 @@ -212,10 +296,21 @@ define i32 @foo() { ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] +; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd 
fast float [[MUL_1]], [[MUL]] +; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 +; THRESHOLD-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 +; THRESHOLD-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; THRESHOLD-NEXT: store i32 [[CONV4]], ptr @n, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll index 1922e935cee4b..45279296d296a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -10,17 +10,65 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15> -; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16 -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x double> poison) -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1 +; CHECK-NEXT: [[LD1_0:%.*]] = load double, ptr [[GEP1_0]], align 8 +; CHECK-NEXT: [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8 +; CHECK-NEXT: [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], [[LD1_0]] +; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 16 +; CHECK-NEXT: [[LD2_0:%.*]] = load double, ptr [[GEP2_0]], align 8 +; CHECK-NEXT: [[MUL2_0:%.*]] = fmul fast double [[LD2_0]], [[LD1_0]] +; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3 +; CHECK-NEXT: [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8 +; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1 +; CHECK-NEXT: [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], align 8 +; CHECK-NEXT: [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], [[LD1_1]] +; CHECK-NEXT: [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]],
[[MUL1_1]] +; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 17 +; CHECK-NEXT: [[LD2_1:%.*]] = load double, ptr [[GEP2_1]], align 8 +; CHECK-NEXT: [[MUL2_1:%.*]] = fmul fast double [[LD2_1]], [[LD1_1]] +; CHECK-NEXT: [[RDX2_0:%.*]] = fadd fast double [[MUL2_0]], [[MUL2_1]] +; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5 +; CHECK-NEXT: [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8 +; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2 +; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 18 +; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7 +; CHECK-NEXT: [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8 +; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9 +; CHECK-NEXT: [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8 +; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11 +; CHECK-NEXT: [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[LD1_2]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[LD1_3]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[LD1_4]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[LD1_5]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[GEP2_2]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13 +; CHECK-NEXT: [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8 +; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6 +; CHECK-NEXT: [[LD0_6:%.*]] = load double, ptr [[GEP0_6]], align 8 +; CHECK-NEXT: [[MUL1_6:%.*]] = fmul fast double [[LD0_6]], [[LD1_6]] +; CHECK-NEXT: [[GEP2_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 22 +; CHECK-NEXT: [[LD2_6:%.*]] = load double, ptr [[GEP2_6]], align 8 +; CHECK-NEXT: [[MUL2_6:%.*]] = fmul fast double [[LD2_6]], [[LD1_6]] +; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 15 +; CHECK-NEXT: [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8 +; CHECK-NEXT: [[GEP0_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 7 +; CHECK-NEXT: [[LD0_7:%.*]] = load double, ptr [[GEP0_7]], align 8 +; CHECK-NEXT: [[MUL1_7:%.*]] = fmul fast double [[LD0_7]], [[LD1_7]] +; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast double [[TMP10]], [[MUL1_6]] +; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast double [[MUL1_7]], [[RDX1_0]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast double [[OP_RDX3]], [[OP_RDX4]] +; CHECK-NEXT: [[GEP2_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 23 +; CHECK-NEXT: [[LD2_7:%.*]] = load double, ptr [[GEP2_7]], align 8 +; CHECK-NEXT: [[MUL2_7:%.*]] = fmul fast double [[LD2_7]], [[LD1_7]] +; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP7]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP11]], [[MUL2_6]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast double [[MUL2_7]], [[RDX2_0]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd 
fast double [[OP_RDX]], [[OP_RDX1]] ; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0 ; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll index f0272d591f0c3..33c281d3f0166 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll @@ -6,9 +6,25 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg define void @rdx_feeds_single_insert(<2 x double> %v, ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) { ; CHECK-LABEL: @rdx_feeds_single_insert( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[ARG1:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <8 x double> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP1]]) +; CHECK-NEXT: [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8 +; CHECK-NEXT: [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], 1.000000e+01 +; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1 +; CHECK-NEXT: [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], align 8 +; CHECK-NEXT: [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], 1.100000e+01 +; CHECK-NEXT: [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]], [[MUL1_1]] +; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <4 x double> [[TMP0]], +; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6 +; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[GEP0_6]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[RDX1_0]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast double [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[I:%.*]] = insertelement <2 x double> [[V:%.*]], double [[TMP2]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true)) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll index 8c9f8b5868d4f..359c24b00e92e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll @@ -1,27 +1,39 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=CHECK -; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 
-mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=CHECK -; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=CHECK +; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=SSE4 +; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX +; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX ; This test checks for a case when a horizontal reduction of floating-point ; adds may look profitable, but is not because it eliminates generation of ; floating-point FMAs that would be more profitable. -; FIXME: We generate a horizontal reduction today. - define void @hr() { -; CHECK-LABEL: @hr( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> , double [[CVT0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]] -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] -; CHECK: exit: -; CHECK-NEXT: ret void +; SSE4-LABEL: @hr( +; SSE4-NEXT: br label [[LOOP:%.*]] +; SSE4: loop: +; SSE4-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ] +; SSE4-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double +; SSE4-NEXT: [[TMP1:%.*]] = insertelement <4 x double> , double [[CVT0]], i32 0 +; SSE4-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]] +; SSE4-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) +; SSE4-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]] +; SSE4-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] +; SSE4: exit: +; SSE4-NEXT: ret void +; +; AVX-LABEL: @hr( +; AVX-NEXT: br label [[LOOP:%.*]] +; AVX: loop: +; AVX-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ] +; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double +; AVX-NEXT: [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]] +; AVX-NEXT: [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]] +; AVX-NEXT: [[ADD1:%.*]] = fadd fast double 0.000000e+00, [[ADD0]] +; AVX-NEXT: [[ADD2:%.*]] = fadd fast double 0.000000e+00, [[ADD1]] +; AVX-NEXT: [[ADD3]] = fadd fast double 0.000000e+00, [[ADD2]] +; AVX-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] +; AVX: exit: +; AVX-NEXT: ret void ; br label %loop @@ -47,18 +59,27 @@ exit: ; may look profitable; but both are not because this eliminates generation ; of floating-point FMAs that would be more profitable. -; FIXME: We generate a horizontal reduction today, and if that's disabled, we -; still vectorize some of the multiplies. 
- define double @hr_or_mul() { -; CHECK-LABEL: @hr_or_mul( -; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]] -; CHECK-NEXT: ret double [[OP_RDX]] +; SSE4-LABEL: @hr_or_mul( +; SSE4-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double +; SSE4-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0 +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer +; SSE4-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; SSE4-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]] +; SSE4-NEXT: ret double [[OP_RDX]] +; +; AVX-LABEL: @hr_or_mul( +; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double +; AVX-NEXT: [[MUL0:%.*]] = fmul fast double 7.000000e+00, [[CVT0]] +; AVX-NEXT: [[ADD0:%.*]] = fadd fast double [[MUL0]], [[CVT0]] +; AVX-NEXT: [[MUL1:%.*]] = fmul fast double -4.300000e+01, [[CVT0]] +; AVX-NEXT: [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD0]] +; AVX-NEXT: [[MUL2:%.*]] = fmul fast double 2.200000e-02, [[CVT0]] +; AVX-NEXT: [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]] +; AVX-NEXT: [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]] +; AVX-NEXT: [[ADD3:%.*]] = fadd fast double [[MUL3]], [[ADD2]] +; AVX-NEXT: ret double [[ADD3]] ; %cvt0 = uitofp i16 3 to double %mul0 = fmul fast double 7.000000e+00, %cvt0 diff --git a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll index a64075db37ba1..5fe02cbc84872 100644 --- a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll @@ -1,32 +1,57 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %} define void @test() { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[BODY:%.*]] -; CHECK: body: -; CHECK-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> , double [[PHI1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> , [[TMP8]] -; CHECK-NEXT: [[ADD8_I_I:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP9]]) -; CHECK-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00 -; CHECK-NEXT: br i1 false, label
[[BODY]], label [[EXIT:%.*]] -; CHECK: exit: -; CHECK-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]] -; CHECK: if.then135.i: -; CHECK-NEXT: [[TMP1:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> , <2 x i1> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], zeroinitializer -; CHECK-NEXT: br label [[IF_END209_I]] -; CHECK: if.end209.i: -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x double> [ [[TMP6]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] -; CHECK-NEXT: ret void +; X86-LABEL: @test( +; X86-NEXT: entry: +; X86-NEXT: br label [[BODY:%.*]] +; X86: body: +; X86-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] +; X86-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ] +; X86-NEXT: [[TMP1:%.*]] = insertelement <2 x double> , double [[PHI1]], i32 0 +; X86-NEXT: [[TMP2:%.*]] = fmul fast <2 x double> , [[TMP1]] +; X86-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP2]]) +; X86-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[TMP3]], 0.000000e+00 +; X86-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]] +; X86: exit: +; X86-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]] +; X86: if.then135.i: +; X86-NEXT: [[TMP4:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer +; X86-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> , <2 x i1> [[TMP4]], <2 x i32> +; X86-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x double> zeroinitializer, <2 x double> zeroinitializer +; X86-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP6]] +; X86-NEXT: [[TMP8:%.*]] = fmul fast <2 x double> [[TMP7]], zeroinitializer +; X86-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP8]], zeroinitializer +; X86-NEXT: br label [[IF_END209_I]] +; X86: if.end209.i: +; X86-NEXT: [[TMP10:%.*]] = phi <2 x double> [ [[TMP9]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] +; X86-NEXT: ret void +; +; AARCH64-LABEL: @test( +; AARCH64-NEXT: entry: +; AARCH64-NEXT: br label [[BODY:%.*]] +; AARCH64: body: +; AARCH64-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] +; AARCH64-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ] +; AARCH64-NEXT: [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00 +; AARCH64-NEXT: [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00 +; AARCH64-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]] +; AARCH64-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00 +; AARCH64-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]] +; AARCH64: exit: +; AARCH64-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]] +; AARCH64: if.then135.i: +; AARCH64-NEXT: [[TMP1:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer +; AARCH64-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> , <2 x i1> [[TMP1]], <2 x i32> +; AARCH64-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer +; AARCH64-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> 
zeroinitializer, [[TMP3]] +; AARCH64-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP4]], zeroinitializer +; AARCH64-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], zeroinitializer +; AARCH64-NEXT: br label [[IF_END209_I]] +; AARCH64: if.end209.i: +; AARCH64-NEXT: [[TMP7:%.*]] = phi <2 x double> [ [[TMP6]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] +; AARCH64-NEXT: ret void ; entry: br label %body diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll index 1e4b598d9fe92..b5d74f0b91ab8 100644 --- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll +++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll @@ -1,24 +1,45 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %} define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778) { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 -; CHECK-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 -; CHECK-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 -; CHECK-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 -; CHECK-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: ret <4 x double> [[TMP7]] +; X86-LABEL: @test( +; X86-NEXT: entry: +; X86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 +; X86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 +; X86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 +; X86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 +; X86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]] +; X86-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0 +; X86-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1 +; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2 +; X86-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3 +; X86-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer +; X86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double>
[[TMP3]], [[TMP4]] +; X86-NEXT: [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3 +; X86-NEXT: [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]] +; X86-NEXT: ret <4 x double> [[TMP7]] +; +; AARCH64-LABEL: @test( +; AARCH64-NEXT: entry: +; AARCH64-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 +; AARCH64-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 +; AARCH64-NEXT: [[I1773:%.*]] = fmul fast double [[I1772]], [[I1754:%.*]] +; AARCH64-NEXT: [[I1782:%.*]] = fmul fast double [[I1754]], [[I1754]] +; AARCH64-NEXT: [[I1783:%.*]] = fadd fast double [[I1782]], 1.000000e+00 +; AARCH64-NEXT: [[I1787:%.*]] = fmul fast double [[I1778:%.*]], [[I1754]] +; AARCH64-NEXT: [[I1788:%.*]] = fadd fast double [[I1787]], 1.000000e+00 +; AARCH64-NEXT: [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]] +; AARCH64-NEXT: [[I1793:%.*]] = fadd fast double [[I1792]], 1.000000e+00 +; AARCH64-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 +; AARCH64-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 +; AARCH64-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781]] +; AARCH64-NEXT: [[TMP4:%.*]] = fadd fast double [[I1773]], [[I1797]] +; AARCH64-NEXT: [[I1976:%.*]] = insertelement <4 x double> zeroinitializer, double [[I1783]], i64 0 +; AARCH64-NEXT: [[I1982:%.*]] = insertelement <4 x double> [[I1976]], double [[I1788]], i64 1 +; AARCH64-NEXT: [[I1988:%.*]] = insertelement <4 x double> [[I1982]], double [[I1793]], i64 2 +; AARCH64-NEXT: [[I1994:%.*]] = insertelement <4 x double> [[I1988]], double [[TMP4]], i64 3 +; AARCH64-NEXT: ret <4 x double> [[I1994]] ; entry: %i1771 = getelementptr inbounds double, ptr %p2, i64 54