@@ -987,7 +987,7 @@ class LoopVectorizationCostModel {
                              InterleavedAccessInfo &IAI)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI) {}
+        Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}

   /// \return An upper bound for the vectorization factors (both fixed and
   /// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1555,9 +1555,9 @@ class LoopVectorizationCostModel {

   /// Return the cost of instructions in an inloop reduction pattern, if I is
   /// part of that pattern.
-  std::optional<InstructionCost>
-  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
-                          TTI::TargetCostKind CostKind) const;
+  std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
+                                                         ElementCount VF,
+                                                         Type *VectorTy) const;

   /// Returns true if \p Op should be considered invariant and if it is
   /// trivially hoistable.
@@ -1616,8 +1616,8 @@ class LoopVectorizationCostModel {

   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
-                                           TTI::TargetCostKind CostKind) const;
+  InstructionCost getScalarizationOverhead(Instruction *I,
+                                           ElementCount VF) const;

   /// Returns true if an artificially high cost for emulated masked memrefs
   /// should be used.
@@ -1798,6 +1798,9 @@ class LoopVectorizationCostModel {

   /// All element types found in the loop.
   SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+  /// The kind of cost that we are calculating
+  TTI::TargetCostKind CostKind;
 };
 } // end namespace llvm

@@ -1838,13 +1841,17 @@ class GeneratedRTChecks {

   PredicatedScalarEvolution &PSE;

+  /// The kind of cost that we are calculating
+  TTI::TargetCostKind CostKind;
+
 public:
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
-                    const DataLayout &DL, bool AddBranchWeights)
+                    const DataLayout &DL, bool AddBranchWeights,
+                    TTI::TargetCostKind CostKind)
       : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
         MemCheckExp(*PSE.getSE(), DL, "scev.check"),
-        AddBranchWeights(AddBranchWeights), PSE(PSE) {}
+        AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}

   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -1956,8 +1963,7 @@ class GeneratedRTChecks {
     for (Instruction &I : *SCEVCheckBlock) {
       if (SCEVCheckBlock->getTerminator() == &I)
         continue;
-      InstructionCost C =
-          TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+      InstructionCost C = TTI->getInstructionCost(&I, CostKind);
       LLVM_DEBUG(dbgs() << "  " << C << " for " << I << "\n");
       RTCheckCost += C;
     }
@@ -1966,8 +1972,7 @@ class GeneratedRTChecks {
     for (Instruction &I : *MemCheckBlock) {
       if (MemCheckBlock->getTerminator() == &I)
         continue;
-      InstructionCost C =
-          TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+      InstructionCost C = TTI->getInstructionCost(&I, CostKind);
       LLVM_DEBUG(dbgs() << "  " << C << " for " << I << "\n");
       MemCheckCost += C;
     }
@@ -2928,10 +2933,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
   if (!VF.isScalar())
     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;

-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   Type *RetTy = CI->getType();
   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
-    if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
+    if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
       return *RedCost;

   SmallVector<Type *, 4> Tys;
@@ -2974,8 +2978,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,

   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                     dyn_cast<IntrinsicInst>(CI));
-  return TTI.getIntrinsicInstrCost(CostAttrs,
-                                   TargetTransformInfo::TCK_RecipThroughput);
+  return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
 }

 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
@@ -3432,8 +3435,6 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
          I->getOpcode() == Instruction::URem);
   assert(!isSafeToSpeculativelyExecute(I));

-  const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
   // Scalarization isn't legal for scalable vector types
   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
   if (!VF.isScalable()) {
@@ -3455,7 +3456,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,

     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
-    ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
+    ScalarizationCost += getScalarizationOverhead(I, VF);

     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
@@ -4445,7 +4446,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
   for (const auto &Plan : VPlans) {
     for (ElementCount VF : Plan->vectorFactors()) {
       VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
-                            CM);
+                            CM, CM.CostKind);
       precomputeCosts(*Plan, VF, CostCtx);
       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -5595,7 +5596,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(

     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
-    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
       ScalarCost += TTI.getScalarizationOverhead(
           cast<VectorType>(toVectorTy(I->getType(), VF)),
@@ -5742,15 +5742,14 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,

   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   const Align Alignment = getLoadStoreAlignment(I);
   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
                                                       ValTy->getScalarType(),
                                                       Alignment, AS, CostKind);

   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
-  Cost += getScalarizationOverhead(I, VF, CostKind);
+  Cost += getScalarizationOverhead(I, VF);

   // If we have a predicated load/store, it will need extra i1 extracts and
   // conditional branches, but may not be executed for each vector lane. Scale
@@ -5783,7 +5782,6 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
   Value *Ptr = getLoadStorePointerOperand(I);
   unsigned AS = getLoadStoreAddressSpace(I);
   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
-  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
          "Stride should be 1 or -1 for consecutive memory access");
@@ -5814,12 +5812,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
   const Align Alignment = getLoadStoreAlignment(I);
   unsigned AS = getLoadStoreAddressSpace(I);
-  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   if (isa<LoadInst>(I)) {
     return TTI.getAddressComputationCost(ValTy) +
            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                                CostKind) +
-           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
+                              CostKind);
   }
   StoreInst *SI = cast<StoreInst>(I);

@@ -5842,9 +5840,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
   const Value *Ptr = getLoadStorePointerOperand(I);

   return TTI.getAddressComputationCost(VectorTy) +
-         TTI.getGatherScatterOpCost(
-             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
-             TargetTransformInfo::TCK_RecipThroughput, I);
+         TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+                                    Legal->isMaskRequired(I), Alignment,
+                                    CostKind, I);
 }

 InstructionCost
@@ -5857,7 +5855,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   Type *ValTy = getLoadStoreType(InsertPos);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
   unsigned AS = getLoadStoreAddressSpace(InsertPos);
-  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   unsigned InterleaveFactor = Group->getFactor();
   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
@@ -5889,9 +5886,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
 }

 std::optional<InstructionCost>
-LoopVectorizationCostModel::getReductionPatternCost(
-    Instruction *I, ElementCount VF, Type *Ty,
-    TTI::TargetCostKind CostKind) const {
+LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
+                                                    ElementCount VF,
+                                                    Type *Ty) const {
   using namespace llvm::PatternMatch;
   // Early exit for no inloop reductions
   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
@@ -6082,14 +6079,15 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,

     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
     return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
-                               TTI::TCK_RecipThroughput, OpInfo, I);
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
+                               OpInfo, I);
   }
   return getWideningCost(I, VF);
 }

-InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
-    Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
+InstructionCost
+LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+                                                     ElementCount VF) const {

   // There is no mechanism yet to create a scalable scalarization loop,
   // so this is currently Invalid.
@@ -6332,7 +6330,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
     InstructionCost ScalarCost = InstructionCost::getInvalid();
     InstructionCost VectorCost = InstructionCost::getInvalid();
     InstructionCost IntrinsicCost = InstructionCost::getInvalid();
-    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Function *ScalarFunc = CI->getCalledFunction();
     Type *ScalarRetTy = CI->getType();
     SmallVector<Type *, 4> Tys, ScalarTys;
@@ -6348,8 +6345,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {

     // Compute costs of unpacking argument values for the scalar calls and
     // packing the return values to a vector.
-    InstructionCost ScalarizationCost =
-        getScalarizationOverhead(CI, VF, CostKind);
+    InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

     ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
     // Honor ForcedScalars and UniformAfterVectorization decisions.
@@ -6373,7 +6369,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
       // An in-loop reduction using an fmuladd intrinsic is a special case;
       // we don't want the normal cost for that intrinsic.
       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
-        if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
+        if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
                                   getVectorIntrinsicIDForCall(CI, TLI),
                                   std::nullopt, *RedCost);
@@ -6458,7 +6454,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
               TargetTransformInfo::SK_Broadcast,
               VectorType::get(IntegerType::getInt1Ty(
                                   VecFunc->getFunctionType()->getContext()),
-                              VF));
+                              VF),
+              {}, CostKind);

       if (TLI && VecFunc && !CI->isNoBuiltin())
         VectorCost =
@@ -6526,7 +6523,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
   auto *SE = PSE.getSE();
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   auto HasSingleCopyAfterVectorization = [this](Instruction *I,
                                                 ElementCount VF) -> bool {
@@ -6702,7 +6698,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       InstructionCost MulCost = TTI::TCC_Free;
       ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
       if (!RHS || RHS->getZExtValue() != 1)
-        MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
+        MulCost =
+            TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

       // Find the cost of the histogram operation itself.
       Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
@@ -6713,9 +6710,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                   {PtrTy, ScalarTy, MaskTy});

       // Add the costs together with the add/sub operation.
-      return TTI.getIntrinsicInstrCost(
-                 ICA, TargetTransformInfo::TCK_RecipThroughput) +
-             MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
+      return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
+             TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
     }
     [[fallthrough]];
   }
@@ -6740,7 +6736,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       return 0;

     // Detect reduction patterns
-    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
       return *RedCost;

     // Certain instructions can be cheaper to vectorize if they have a constant
@@ -6905,7 +6901,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     }

     // Detect reduction patterns
-    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
       return *RedCost;

     Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6930,7 +6926,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::Call:
     return getVectorCallCost(cast<CallInst>(I), VF);
   case Instruction::ExtractValue:
-    return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
+    return TTI.getInstructionCost(I, CostKind);
   case Instruction::Alloca:
     // We cannot easily widen alloca to a scalable alloca, as
     // the result would need to be a vector of pointers.
@@ -7442,8 +7438,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

   // Pre-compute the cost for I, if it has a reduction pattern cost.
   for (Instruction *I : ChainOpsAndOperands) {
-    auto ReductionCost = CM.getReductionPatternCost(
-        I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+    auto ReductionCost =
+        CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
     if (!ReductionCost)
       continue;

@@ -7501,7 +7497,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+                        CM.CostKind);
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

   // Now compute and add the VPlan-based cost.
@@ -7581,6 +7578,16 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
     return {*FirstPlan.vectorFactors().begin(), 0, 0};

+  LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
+                    << (CM.CostKind == TTI::TCK_RecipThroughput
+                            ? "Reciprocal Throughput\n"
+                        : CM.CostKind == TTI::TCK_Latency
+                            ? "Instruction Latency\n"
+                        : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
+                        : CM.CostKind == TTI::TCK_SizeAndLatency
+                            ? "Code Size and Latency\n"
+                            : "Unknown\n"));
+
   ElementCount ScalarVF = ElementCount::getFixed(1);
   assert(hasPlanWithVF(ScalarVF) &&
          "More than a single plan/VF w/o any plan having scalar VF");
@@ -7634,7 +7641,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+                        CM.CostKind);
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   assert((BestFactor.Width == LegacyVF.Width ||
           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
@@ -10155,7 +10163,7 @@ static bool processLoopInVPlanNativePath(
   bool AddBranchWeights =
       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
-                           AddBranchWeights);
+                           AddBranchWeights, CM.CostKind);
   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                          VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10692,7 +10700,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   bool AddBranchWeights =
       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
-                           AddBranchWeights);
+                           AddBranchWeights, CM.CostKind);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);