@@ -987,7 +987,7 @@ class LoopVectorizationCostModel {
                              InterleavedAccessInfo &IAI)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI) {}
+        Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}

   /// \return An upper bound for the vectorization factors (both fixed and
   /// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1553,9 +1553,9 @@ class LoopVectorizationCostModel {

   /// Return the cost of instructions in an inloop reduction pattern, if I is
   /// part of that pattern.
-  std::optional<InstructionCost>
-  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
-                          TTI::TargetCostKind CostKind) const;
+  std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
+                                                         ElementCount VF,
+                                                         Type *VectorTy) const;

   /// Returns true if \p Op should be considered invariant and if it is
   /// trivially hoistable.
@@ -1614,8 +1614,8 @@ class LoopVectorizationCostModel {

   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
-                                           TTI::TargetCostKind CostKind) const;
+  InstructionCost getScalarizationOverhead(Instruction *I,
+                                           ElementCount VF) const;

   /// Returns true if an artificially high cost for emulated masked memrefs
   /// should be used.
@@ -1796,6 +1796,9 @@ class LoopVectorizationCostModel {

   /// All element types found in the loop.
   SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+  /// The kind of cost that we are calculating
+  TTI::TargetCostKind CostKind;
 };
 } // end namespace llvm

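Note: the hunks above are one mechanical change, repeated: drop the per-function local `TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;` (or the literal `TCK_RecipThroughput` at a call site) and instead read a single member that the constructor sets once. A minimal self-contained sketch of that pattern follows; the names `TTIStub`, `MiniCostModel`, and `computeCost` are hypothetical stand-ins, and only `TargetCostKind`/`TCK_RecipThroughput` mirror LLVM:

#include <cstdint>

// Hypothetical stand-in for llvm::TargetTransformInfo's cost-kind enum.
struct TTIStub {
  enum TargetCostKind { TCK_RecipThroughput, TCK_Latency, TCK_CodeSize };
};

class MiniCostModel {
  // Before the patch: each costing helper re-declared this as a local.
  // After: one member, set once, read by every helper.
  TTIStub::TargetCostKind CostKind;

public:
  MiniCostModel() : CostKind(TTIStub::TCK_RecipThroughput) {}

  uint64_t computeCost(uint64_t BaseCost) const {
    // Reads the member instead of hard-coding TCK_RecipThroughput here,
    // so switching the whole model to another kind is a one-line change.
    return CostKind == TTIStub::TCK_RecipThroughput ? BaseCost : 2 * BaseCost;
  }
};

The payoff is consistency: no helper can silently disagree with the rest of the model about which kind of cost it is computing.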
@@ -1836,13 +1839,17 @@ class GeneratedRTChecks {

   PredicatedScalarEvolution &PSE;

+  /// The kind of cost that we are calculating
+  TTI::TargetCostKind CostKind;
+
 public:
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
-                    const DataLayout &DL, bool AddBranchWeights)
+                    const DataLayout &DL, bool AddBranchWeights,
+                    TTI::TargetCostKind CostKind)
       : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
         MemCheckExp(*PSE.getSE(), DL, "scev.check"),
-        AddBranchWeights(AddBranchWeights), PSE(PSE) {}
+        AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}

   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -1954,8 +1961,7 @@ class GeneratedRTChecks {
     for (Instruction &I : *SCEVCheckBlock) {
       if (SCEVCheckBlock->getTerminator() == &I)
         continue;
-      InstructionCost C =
-          TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+      InstructionCost C = TTI->getInstructionCost(&I, CostKind);
       LLVM_DEBUG(dbgs() << "  " << C << "  for  " << I << "\n");
       RTCheckCost += C;
     }
@@ -1964,8 +1970,7 @@ class GeneratedRTChecks {
     for (Instruction &I : *MemCheckBlock) {
       if (MemCheckBlock->getTerminator() == &I)
         continue;
-      InstructionCost C =
-          TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+      InstructionCost C = TTI->getInstructionCost(&I, CostKind);
       LLVM_DEBUG(dbgs() << "  " << C << "  for  " << I << "\n");
       MemCheckCost += C;
     }
@@ -2926,18 +2931,17 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
   if (!VF.isScalar())
     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;

-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   Type *RetTy = CI->getType();
   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
-    if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
+    if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
       return *RedCost;

   SmallVector<Type *, 4> Tys;
   for (auto &ArgOp : CI->args())
     Tys.push_back(ArgOp->getType());

   InstructionCost ScalarCallCost =
-      TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
+      TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys);

   // If this is an intrinsic we may have a lower cost for it.
   if (getVectorIntrinsicIDForCall(CI, TLI)) {
@@ -2972,8 +2976,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,

   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                     dyn_cast<IntrinsicInst>(CI));
-  return TTI.getIntrinsicInstrCost(CostAttrs,
-                                   TargetTransformInfo::TCK_RecipThroughput);
+  return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
 }

 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
@@ -3430,8 +3433,6 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
           I->getOpcode() == Instruction::URem);
   assert(!isSafeToSpeculativelyExecute(I));

-  const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
   // Scalarization isn't legal for scalable vector types
   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
   if (!VF.isScalable()) {
@@ -3453,7 +3454,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,

     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
-    ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
+    ScalarizationCost += getScalarizationOverhead(I, VF);

     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
@@ -4426,7 +4427,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
   for (const auto &Plan : VPlans) {
     for (ElementCount VF : Plan->vectorFactors()) {
       VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
-                            CM);
+                            CM, CM.CostKind);
       precomputeCosts(*Plan, VF, CostCtx);
       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -5576,7 +5577,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(

     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
-    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
       ScalarCost += TTI.getScalarizationOverhead(
           cast<VectorType>(toVectorTy(I->getType(), VF)),
@@ -5723,15 +5723,14 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,

   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   const Align Alignment = getLoadStoreAlignment(I);
   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
                                                       ValTy->getScalarType(),
                                                       Alignment, AS, CostKind);

   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
-  Cost += getScalarizationOverhead(I, VF, CostKind);
+  Cost += getScalarizationOverhead(I, VF);

   // If we have a predicated load/store, it will need extra i1 extracts and
   // conditional branches, but may not be executed for each vector lane. Scale
@@ -5764,7 +5763,6 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
   Value *Ptr = getLoadStorePointerOperand(I);
   unsigned AS = getLoadStoreAddressSpace(I);
   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
-  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
          "Stride should be 1 or -1 for consecutive memory access");
@@ -5795,12 +5793,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
   const Align Alignment = getLoadStoreAlignment(I);
   unsigned AS = getLoadStoreAddressSpace(I);
-  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   if (isa<LoadInst>(I)) {
     return TTI.getAddressComputationCost(ValTy) +
            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                                CostKind) +
-           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
+                              CostKind);
   }
   StoreInst *SI = cast<StoreInst>(I);

@@ -5823,9 +5821,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
   const Value *Ptr = getLoadStorePointerOperand(I);

   return TTI.getAddressComputationCost(VectorTy) +
-         TTI.getGatherScatterOpCost(
-             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
-             TargetTransformInfo::TCK_RecipThroughput, I);
+         TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+                                    Legal->isMaskRequired(I), Alignment,
+                                    CostKind, I);
 }

 InstructionCost
@@ -5838,7 +5836,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   Type *ValTy = getLoadStoreType(InsertPos);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
   unsigned AS = getLoadStoreAddressSpace(InsertPos);
-  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   unsigned InterleaveFactor = Group->getFactor();
   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
@@ -5870,9 +5867,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
 }

 std::optional<InstructionCost>
-LoopVectorizationCostModel::getReductionPatternCost(
-    Instruction *I, ElementCount VF, Type *Ty,
-    TTI::TargetCostKind CostKind) const {
+LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
+                                                    ElementCount VF,
+                                                    Type *Ty) const {
   using namespace llvm::PatternMatch;
   // Early exit for no inloop reductions
   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
@@ -6063,14 +6060,15 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,

     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
     return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
-                               TTI::TCK_RecipThroughput, OpInfo, I);
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
+                               OpInfo, I);
   }
   return getWideningCost(I, VF);
 }

-InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
-    Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
+InstructionCost
+LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+                                                     ElementCount VF) const {

   // There is no mechanism yet to create a scalable scalarization loop,
   // so this is currently Invalid.
@@ -6313,7 +6311,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
     InstructionCost ScalarCost = InstructionCost::getInvalid();
     InstructionCost VectorCost = InstructionCost::getInvalid();
     InstructionCost IntrinsicCost = InstructionCost::getInvalid();
-    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Function *ScalarFunc = CI->getCalledFunction();
     Type *ScalarRetTy = CI->getType();
     SmallVector<Type *, 4> Tys, ScalarTys;
@@ -6329,8 +6326,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {

     // Compute costs of unpacking argument values for the scalar calls and
     // packing the return values to a vector.
-    InstructionCost ScalarizationCost =
-        getScalarizationOverhead(CI, VF, CostKind);
+    InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

     ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
     // Honor ForcedScalars and UniformAfterVectorization decisions.
@@ -6354,7 +6350,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
     // An in-loop reduction using an fmuladd intrinsic is a special case;
     // we don't want the normal cost for that intrinsic.
     if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
-      if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
+      if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
         setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
                                 getVectorIntrinsicIDForCall(CI, TLI),
                                 std::nullopt, *RedCost);
@@ -6439,7 +6435,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
             TargetTransformInfo::SK_Broadcast,
             VectorType::get(IntegerType::getInt1Ty(
                                 VecFunc->getFunctionType()->getContext()),
-                            VF));
+                            VF),
+            {}, CostKind);

     if (TLI && VecFunc && !CI->isNoBuiltin())
       VectorCost =
@@ -6507,7 +6504,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
   auto *SE = PSE.getSE();
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   auto HasSingleCopyAfterVectorization = [this](Instruction *I,
                                                 ElementCount VF) -> bool {
@@ -6694,9 +6690,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                   {PtrTy, ScalarTy, MaskTy});

       // Add the costs together with the add/sub operation.
-      return TTI.getIntrinsicInstrCost(
-                 ICA, TargetTransformInfo::TCK_RecipThroughput) +
-             MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
+      return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
+             TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
     }
     [[fallthrough]];
   }
@@ -6721,7 +6716,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       return 0;

     // Detect reduction patterns
-    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
       return *RedCost;

     // Certain instructions can be cheaper to vectorize if they have a constant
@@ -6886,7 +6881,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     }

     // Detect reduction patterns
-    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
       return *RedCost;

     Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6911,7 +6906,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::Call:
     return getVectorCallCost(cast<CallInst>(I), VF);
   case Instruction::ExtractValue:
-    return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
+    return TTI.getInstructionCost(I, CostKind);
   case Instruction::Alloca:
     // We cannot easily widen alloca to a scalable alloca, as
     // the result would need to be a vector of pointers.
@@ -7423,8 +7418,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

   // Pre-compute the cost for I, if it has a reduction pattern cost.
   for (Instruction *I : ChainOpsAndOperands) {
-    auto ReductionCost = CM.getReductionPatternCost(
-        I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+    auto ReductionCost =
+        CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
     if (!ReductionCost)
       continue;

@@ -7482,7 +7477,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+                        CM.CostKind);
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

   // Now compute and add the VPlan-based cost.
@@ -7611,7 +7607,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+                        CM.CostKind);
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   assert((BestFactor.Width == LegacyVF.Width ||
           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
@@ -9971,7 +9968,7 @@ static bool processLoopInVPlanNativePath(
   bool AddBranchWeights =
       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
-                           AddBranchWeights);
+                           AddBranchWeights, CM.CostKind);
   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                          VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10488,7 +10485,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   bool AddBranchWeights =
       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
-                           AddBranchWeights);
+                           AddBranchWeights, CM.CostKind);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
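The last two hunks are the call-site half of the same change: `GeneratedRTChecks` no longer hard-codes `TCK_RecipThroughput` when costing the runtime-check blocks but receives the cost model's `CostKind` at construction. A hedged usage sketch of that threading follows; `TTIStub`, `MiniCostModel`, and `MiniRTChecks` are hypothetical simplified stand-ins, and only the `CM.CostKind` constructor argument mirrors the patch:

#include <cassert>

// Hypothetical stand-in for llvm::TargetTransformInfo's cost-kind enum.
struct TTIStub {
  enum TargetCostKind { TCK_RecipThroughput, TCK_Latency, TCK_CodeSize };
};

// Stand-in for LoopVectorizationCostModel: owns the single cost kind.
struct MiniCostModel {
  TTIStub::TargetCostKind CostKind = TTIStub::TCK_RecipThroughput;
};

// Stand-in for GeneratedRTChecks: costs its blocks with the kind it is given.
class MiniRTChecks {
  TTIStub::TargetCostKind CostKind;

public:
  explicit MiniRTChecks(TTIStub::TargetCostKind CostKind)
      : CostKind(CostKind) {}
  TTIStub::TargetCostKind kind() const { return CostKind; }
};

int main() {
  MiniCostModel CM;
  // Mirrors `GeneratedRTChecks Checks(..., CM.CostKind);` above: the
  // runtime checks are costed with the same kind as the loop body.
  MiniRTChecks Checks(CM.CostKind);
  assert(Checks.kind() == CM.CostKind);
  return 0;
}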