diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e0f629e14f657..397d53b1c4b4c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -987,7 +987,7 @@ class LoopVectorizationCostModel {
                              InterleavedAccessInfo &IAI)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI) {}
+        Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}

   /// \return An upper bound for the vectorization factors (both fixed and
   /// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1553,9 +1553,9 @@ class LoopVectorizationCostModel {

   /// Return the cost of instructions in an inloop reduction pattern, if I is
   /// part of that pattern.
-  std::optional<InstructionCost>
-  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
-                          TTI::TargetCostKind CostKind) const;
+  std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
+                                                         ElementCount VF,
+                                                         Type *VectorTy) const;

   /// Returns true if \p Op should be considered invariant and if it is
   /// trivially hoistable.
@@ -1614,8 +1614,8 @@ class LoopVectorizationCostModel {

   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
-                                           TTI::TargetCostKind CostKind) const;
+  InstructionCost getScalarizationOverhead(Instruction *I,
+                                           ElementCount VF) const;

   /// Returns true if an artificially high cost for emulated masked memrefs
   /// should be used.
@@ -1796,6 +1796,9 @@ class LoopVectorizationCostModel {

   /// All element types found in the loop.
   SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+  /// The kind of cost that we are calculating
+  TTI::TargetCostKind CostKind;
 };
 } // end namespace llvm

@@ -1836,13 +1839,17 @@ class GeneratedRTChecks {

   PredicatedScalarEvolution &PSE;

+  /// The kind of cost that we are calculating
+  TTI::TargetCostKind CostKind;
+
 public:
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
-                    const DataLayout &DL, bool AddBranchWeights)
+                    const DataLayout &DL, bool AddBranchWeights,
+                    TTI::TargetCostKind CostKind)
       : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
         MemCheckExp(*PSE.getSE(), DL, "scev.check"),
-        AddBranchWeights(AddBranchWeights), PSE(PSE) {}
+        AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}

   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -1954,8 +1961,7 @@ class GeneratedRTChecks {
     for (Instruction &I : *SCEVCheckBlock) {
       if (SCEVCheckBlock->getTerminator() == &I)
         continue;
-      InstructionCost C =
-          TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+      InstructionCost C = TTI->getInstructionCost(&I, CostKind);
       LLVM_DEBUG(dbgs() << "  " << C << " for " << I << "\n");
       RTCheckCost += C;
     }
@@ -1964,8 +1970,7 @@ class GeneratedRTChecks {
     for (Instruction &I : *MemCheckBlock) {
       if (MemCheckBlock->getTerminator() == &I)
         continue;
-      InstructionCost C =
-          TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+      InstructionCost C = TTI->getInstructionCost(&I, CostKind);
       LLVM_DEBUG(dbgs() << "  " << C << " for " << I << "\n");
       MemCheckCost += C;
     }
@@ -2926,10 +2931,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
   if (!VF.isScalar())
     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;

-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   Type *RetTy = CI->getType();
   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
-    if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
+    if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
       return *RedCost;

   SmallVector<Type *, 4> Tys;
@@ -2972,8 +2976,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,

   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                     dyn_cast<IntrinsicInst>(CI));
-  return TTI.getIntrinsicInstrCost(CostAttrs,
-                                   TargetTransformInfo::TCK_RecipThroughput);
+  return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
 }

 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
@@ -3430,8 +3433,6 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
          I->getOpcode() == Instruction::URem);
   assert(!isSafeToSpeculativelyExecute(I));

-  const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
   // Scalarization isn't legal for scalable vector types
   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
   if (!VF.isScalable()) {
@@ -3453,7 +3454,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,

     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
-    ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
+    ScalarizationCost += getScalarizationOverhead(I, VF);

     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
@@ -4426,7 +4427,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
   for (const auto &Plan : VPlans) {
     for (ElementCount VF : Plan->vectorFactors()) {
       VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
-                            CM);
+                            CM, CM.CostKind);
       precomputeCosts(*Plan, VF, CostCtx);
       auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -5576,7 +5577,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(

     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
-    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
       ScalarCost += TTI.getScalarizationOverhead(
           cast<VectorType>(toVectorTy(I->getType(), VF)),
@@ -5723,7 +5723,6 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,

   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   const Align Alignment = getLoadStoreAlignment(I);
   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
                                                       ValTy->getScalarType(),
@@ -5731,7 +5730,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,

   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
-  Cost += getScalarizationOverhead(I, VF, CostKind);
+  Cost += getScalarizationOverhead(I, VF);

   // If we have a predicated load/store, it will need extra i1 extracts and
   // conditional branches, but may not be executed for each vector lane. Scale
@@ -5764,7 +5763,6 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
   Value *Ptr = getLoadStorePointerOperand(I);
   unsigned AS = getLoadStoreAddressSpace(I);
   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
-  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
          "Stride should be 1 or -1 for consecutive memory access");
@@ -5795,12 +5793,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
   const Align Alignment = getLoadStoreAlignment(I);
   unsigned AS = getLoadStoreAddressSpace(I);
-  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   if (isa<LoadInst>(I)) {
     return TTI.getAddressComputationCost(ValTy) +
            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                                CostKind) +
-           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
+                              CostKind);
   }
   StoreInst *SI = cast<StoreInst>(I);
@@ -5823,9 +5821,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
   const Value *Ptr = getLoadStorePointerOperand(I);

   return TTI.getAddressComputationCost(VectorTy) +
-         TTI.getGatherScatterOpCost(
-             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
-             TargetTransformInfo::TCK_RecipThroughput, I);
+         TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+                                    Legal->isMaskRequired(I), Alignment,
+                                    CostKind, I);
 }

 InstructionCost
@@ -5838,7 +5836,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   Type *ValTy = getLoadStoreType(InsertPos);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
   unsigned AS = getLoadStoreAddressSpace(InsertPos);
-  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   unsigned InterleaveFactor = Group->getFactor();
   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
@@ -5870,9 +5867,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
 }

 std::optional<InstructionCost>
-LoopVectorizationCostModel::getReductionPatternCost(
-    Instruction *I, ElementCount VF, Type *Ty,
-    TTI::TargetCostKind CostKind) const {
+LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
+                                                    ElementCount VF,
+                                                    Type *Ty) const {
   using namespace llvm::PatternMatch;
   // Early exit for no inloop reductions
   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
@@ -6063,14 +6060,15 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,

     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
     return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
-                               TTI::TCK_RecipThroughput, OpInfo, I);
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
+                               OpInfo, I);
   }
   return getWideningCost(I, VF);
 }

-InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
-    Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
+InstructionCost
+LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+                                                     ElementCount VF) const {

   // There is no mechanism yet to create a scalable scalarization loop,
   // so this is currently Invalid.
@@ -6313,7 +6311,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
       InstructionCost ScalarCost = InstructionCost::getInvalid();
       InstructionCost VectorCost = InstructionCost::getInvalid();
       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
-      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
       Function *ScalarFunc = CI->getCalledFunction();
       Type *ScalarRetTy = CI->getType();
       SmallVector<Type *, 4> Tys, ScalarTys;
@@ -6329,8 +6326,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
       // Compute costs of unpacking argument values for the scalar calls and
       // packing the return values to a vector.
-      InstructionCost ScalarizationCost =
-          getScalarizationOverhead(CI, VF, CostKind);
+      InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;

       // Honor ForcedScalars and UniformAfterVectorization decisions.
@@ -6354,7 +6350,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
       // An in-loop reduction using an fmuladd intrinsic is a special case;
       // we don't want the normal cost for that intrinsic.
       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
-        if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
+        if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
                                   getVectorIntrinsicIDForCall(CI, TLI),
                                   std::nullopt, *RedCost);
@@ -6439,7 +6435,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
             TargetTransformInfo::SK_Broadcast,
             VectorType::get(IntegerType::getInt1Ty(
                                 VecFunc->getFunctionType()->getContext()),
-                            VF));
+                            VF),
+            {}, CostKind);

       if (TLI && VecFunc && !CI->isNoBuiltin())
         VectorCost =
@@ -6507,7 +6504,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
   auto *SE = PSE.getSE();
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   auto HasSingleCopyAfterVectorization = [this](Instruction *I,
                                                 ElementCount VF) -> bool {
@@ -6683,7 +6679,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       InstructionCost MulCost = TTI::TCC_Free;
       ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
       if (!RHS || RHS->getZExtValue() != 1)
-        MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
+        MulCost =
+            TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

       // Find the cost of the histogram operation itself.
       Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
@@ -6694,9 +6691,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                   {PtrTy, ScalarTy, MaskTy});

       // Add the costs together with the add/sub operation.
-      return TTI.getIntrinsicInstrCost(
-                 ICA, TargetTransformInfo::TCK_RecipThroughput) +
-             MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
+      return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
+             TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
     }
     [[fallthrough]];
   }
@@ -6721,7 +6717,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       return 0;

     // Detect reduction patterns
-    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
       return *RedCost;

     // Certain instructions can be cheaper to vectorize if they have a constant
@@ -6886,7 +6882,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     }

     // Detect reduction patterns
-    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
       return *RedCost;

     Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6911,7 +6907,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::Call:
     return getVectorCallCost(cast<CallInst>(I), VF);
   case Instruction::ExtractValue:
-    return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
+    return TTI.getInstructionCost(I, CostKind);
   case Instruction::Alloca:
     // We cannot easily widen alloca to a scalable alloca, as
     // the result would need to be a vector of pointers.
@@ -7423,8 +7419,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

   // Pre-compute the cost for I, if it has a reduction pattern cost.
   for (Instruction *I : ChainOpsAndOperands) {
-    auto ReductionCost = CM.getReductionPatternCost(
-        I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+    auto ReductionCost =
+        CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
     if (!ReductionCost)
       continue;
@@ -7482,7 +7478,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+                        CM.CostKind);
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

   // Now compute and add the VPlan-based cost.
@@ -7558,6 +7555,16 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
     return {*FirstPlan.vectorFactors().begin(), 0, 0};

+  LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
+                    << (CM.CostKind == TTI::TCK_RecipThroughput
+                            ? "Reciprocal Throughput\n"
+                        : CM.CostKind == TTI::TCK_Latency
+                            ? "Instruction Latency\n"
+                        : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
+                        : CM.CostKind == TTI::TCK_SizeAndLatency
+                            ? "Code Size and Latency\n"
+                            : "Unknown\n"));
+
   ElementCount ScalarVF = ElementCount::getFixed(1);
   assert(hasPlanWithVF(ScalarVF) &&
          "More than a single plan/VF w/o any plan having scalar VF");
@@ -7611,7 +7618,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+                        CM.CostKind);
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   assert((BestFactor.Width == LegacyVF.Width ||
           planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
@@ -9971,7 +9979,7 @@ static bool processLoopInVPlanNativePath(
   bool AddBranchWeights =
       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
-                           AddBranchWeights);
+                           AddBranchWeights, CM.CostKind);
   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                          VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10488,7 +10496,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   bool AddBranchWeights =
       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
   GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
-                           AddBranchWeights);
+                           AddBranchWeights, CM.CostKind);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index e804f81c36dba..e0c9f6d27cb88 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -770,7 +770,7 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
   InstructionCost BackedgeCost =
       ForceTargetInstructionCost.getNumOccurrences()
           ? InstructionCost(ForceTargetInstructionCost.getNumOccurrences())
-          : Ctx.TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+          : Ctx.TTI.getCFInstrCost(Instruction::Br, Ctx.CostKind);
   LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost << " for VF " << VF
                     << ": vector loop backedge\n");
   Cost += BackedgeCost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9d7bf97d305ed..25f889028cb39 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -686,11 +686,13 @@ struct VPCostContext {
   LLVMContext &LLVMCtx;
   LoopVectorizationCostModel &CM;
   SmallPtrSet<Instruction *, 8> SkipCostComputation;
+  TargetTransformInfo::TargetCostKind CostKind;

   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
-                Type *CanIVTy, LoopVectorizationCostModel &CM)
+                Type *CanIVTy, LoopVectorizationCostModel &CM,
+                TargetTransformInfo::TargetCostKind CostKind)
       : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()),
-        CM(CM) {}
+        CM(CM), CostKind(CostKind) {}

   /// Return the cost for \p UI with \p VF using the legacy cost model as
   /// fallback until computing the cost of all recipes migrates to VPlan.
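[Editor's note: the following illustration is not part of the patch. The structural change above -- selecting one TTI::TargetCostKind when the cost model is constructed, instead of hard-coding TCK_RecipThroughput at every query site -- can be sketched in isolation. All names below (CostKind, CostModel, mulCost, loadCost) are hypothetical stand-ins, not LLVM API; only the enumerator names mirror the four kinds the patch's LLVM_DEBUG message distinguishes.]

#include <cstdio>

// Stand-in for TTI::TargetCostKind.
enum class CostKind { RecipThroughput, Latency, CodeSize, SizeAndLatency };

// Hypothetical miniature of LoopVectorizationCostModel: the kind is fixed
// once at construction, and every query reads the stored member instead of
// taking a cost-kind parameter (as getScalarizationOverhead and
// getReductionPatternCost now do in the patch).
class CostModel {
  CostKind Kind;

public:
  explicit CostModel(CostKind K = CostKind::RecipThroughput) : Kind(K) {}

  // Toy costs: a size query counts instructions, a throughput query
  // weights them. The numbers are illustrative only.
  int mulCost() const { return Kind == CostKind::CodeSize ? 1 : 4; }
  int loadCost() const { return Kind == CostKind::CodeSize ? 1 : 3; }
};

int main() {
  CostModel Throughput;               // the default, as in the patch today
  CostModel Size(CostKind::CodeSize); // a future size-optimizing client
  std::printf("mul: %d vs %d\n", Throughput.mulCost(), Size.mulCost());
}

The payoff of this shape is that switching the entire model to a different kind (e.g. for minsize functions) becomes a one-argument change at construction, with every downstream query, including the runtime-check costing in GeneratedRTChecks, automatically agreeing.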
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 76336ae447edb..ae0785ad9a67a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -920,10 +920,9 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {

 InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
                                   Variant->getFunctionType()->params(),
-                                  CostKind);
+                                  Ctx.CostKind);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1001,8 +1000,6 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {

 InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
                                                     VPCostContext &Ctx) const {
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
   // Some backends analyze intrinsic arguments to determine cost. Use the
   // underlying value for the operand if it has one. Otherwise try to use the
   // operand of the underlying call instruction, if there is one. Otherwise
@@ -1042,7 +1039,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
   IntrinsicCostAttributes CostAttrs(
       VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
       dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
-  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
+  return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
 }

 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
@@ -1125,7 +1122,7 @@ InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
   // Assume that a non-constant update value (or a constant != 1) requires
   // a multiply, and add that into the cost.
   InstructionCost MulCost =
-      Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy);
+      Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
   if (IncAmt->isLiveIn()) {
     ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());

@@ -1141,9 +1138,8 @@ InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
                               {PtrTy, IncTy, MaskTy});

   // Add the costs together with the add/sub operation.
-  return Ctx.TTI.getIntrinsicInstrCost(
-             ICA, TargetTransformInfo::TCK_RecipThroughput) +
-         MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy);
+  return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
+         Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1204,7 +1200,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
   bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
   Type *ScalarTy = Ctx.Types.inferScalarType(this);
   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   VPValue *Op0, *Op1;
   using namespace llvm::VPlanPatternMatch;
@@ -1222,8 +1217,8 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
     Operands.append(SI->op_begin(), SI->op_end());
     bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
     return Ctx.TTI.getArithmeticInstrCost(
-        IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy, CostKind,
-        {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
+        IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
+        Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
   }

   Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
@@ -1233,9 +1228,9 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
   CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
   if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
     Pred = Cmp->getPredicate();
-  return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy, CondTy, Pred,
-                                    CostKind, {TTI::OK_AnyValue, TTI::OP_None},
-                                    {TTI::OK_AnyValue, TTI::OP_None}, SI);
+  return Ctx.TTI.getCmpSelInstrCost(
+      Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
+      {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
 }

 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
@@ -1380,12 +1375,11 @@ void VPWidenRecipe::execute(VPTransformState &State) {

 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                            VPCostContext &Ctx) const {
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   switch (Opcode) {
   case Instruction::FNeg: {
     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
     return Ctx.TTI.getArithmeticInstrCost(
-        Opcode, VectorTy, CostKind,
+        Opcode, VectorTy, Ctx.CostKind,
         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
   }
@@ -1428,21 +1422,22 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
     if (CtxI)
       Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
     return Ctx.TTI.getArithmeticInstrCost(
-        Opcode, VectorTy, CostKind,
+        Opcode, VectorTy, Ctx.CostKind,
         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
         RHSInfo, Operands, CtxI, &Ctx.TLI);
   }
   case Instruction::Freeze: {
     // This opcode is unknown. Assume that it is the same as 'mul'.
     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
-    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
+                                          Ctx.CostKind);
   }
   case Instruction::ICmp:
   case Instruction::FCmp: {
     Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
     Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
     return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
-                                      CostKind,
+                                      Ctx.CostKind,
                                       {TTI::OK_AnyValue, TTI::OP_None},
                                       {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
   }
@@ -1572,7 +1567,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
   auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
   // Arm TTI will use the underlying instruction to determine the cost.
   return Ctx.TTI.getCastInstrCost(
-      Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput,
+      Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
       dyn_cast_if_present<Instruction>(getUnderlyingValue()));
 }

@@ -1590,7 +1585,7 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,

 InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
-  return Ctx.TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+  return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
 }

 /// This function adds
@@ -2069,18 +2064,16 @@ void VPBlendRecipe::execute(VPTransformState &State) {

 InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
                                            VPCostContext &Ctx) const {
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
   // Handle cases where only the first lane is used the same way as the legacy
   // cost model.
   if (vputils::onlyFirstLaneUsed(this))
-    return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
+    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);

   Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
   Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
   return (getNumIncomingValues() - 1) *
          Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
-                                    CmpInst::BAD_ICMP_PREDICATE, CostKind);
+                                    CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2200,7 +2193,6 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
   RecurKind RdxKind = RdxDesc.getRecurrenceKind();
   Type *ElementTy = Ctx.Types.inferScalarType(this);
   auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   unsigned Opcode = RdxDesc.getOpcode();

   // TODO: Support any-of and in-loop reductions.
@@ -2218,15 +2210,15 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,

   // Cost = Reduction cost + BinOp cost
   InstructionCost Cost =
-      Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind);
+      Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
     return Cost + Ctx.TTI.getMinMaxReductionCost(
-                      Id, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+                      Id, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
   }

   return Cost + Ctx.TTI.getArithmeticReductionCost(
-                    Opcode, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+                    Opcode, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2457,7 +2449,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
   unsigned AS =
       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   if (!Consecutive) {
     // TODO: Using the original IR may not be accurate.
@@ -2468,25 +2459,26 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
            "Inconsecutive memory access should not have the order.");
     return Ctx.TTI.getAddressComputationCost(Ty) +
            Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
-                                          IsMasked, Alignment, CostKind,
+                                          IsMasked, Alignment, Ctx.CostKind,
                                           &Ingredient);
   }

   InstructionCost Cost = 0;
   if (IsMasked) {
     Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
-                                          AS, CostKind);
+                                          AS, Ctx.CostKind);
   } else {
     TTI::OperandValueInfo OpInfo =
         Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
     Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
-                                    CostKind, OpInfo, &Ingredient);
+                                    Ctx.CostKind, OpInfo, &Ingredient);
   }
   if (!Reverse)
     return Cost;

-  return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                        cast<VectorType>(Ty), {}, CostKind, 0);
+  return Cost +=
+         Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
 }

 void VPWidenLoadRecipe::execute(VPTransformState &State) {
@@ -2604,14 +2596,14 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
   unsigned AS =
       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
-      Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
+      Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
   if (!Reverse)
     return Cost;

   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                       cast<VectorType>(Ty), {}, CostKind, 0);
+                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
+                                       0);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2725,14 +2717,14 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
   unsigned AS =
       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
-      Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
+      Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
   if (!Reverse)
     return Cost;

   return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                       cast<VectorType>(Ty), {}, CostKind, 0);
+                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
+                                       0);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3090,7 +3082,6 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
                           : getStoredValues()[InsertPosIdx]);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
   unsigned AS = getLoadStoreAddressSpace(InsertPos);
-  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   unsigned InterleaveFactor = IG->getFactor();
   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
@@ -3104,14 +3095,15 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,

   // Calculate the cost of the whole interleaved group.
   InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
       InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
-      IG->getAlign(), AS, CostKind, getMask(), NeedsMaskForGaps);
+      IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);

   if (!IG->isReverse())
     return Cost;

   return Cost + IG->getNumMembers() *
                     Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                           VectorTy, std::nullopt, CostKind, 0);
+                                           VectorTy, std::nullopt, Ctx.CostKind,
+                                           0);
 }

 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3321,9 +3313,8 @@ void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {

 InstructionCost
 VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
                                              VPCostContext &Ctx) const {
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   if (VF.isScalar())
-    return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
+    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);

   if (VF.isScalable() && VF.getKnownMinValue() == 1)
     return InstructionCost::getInvalid();
@@ -3334,7 +3325,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
       toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);

   return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
-                                cast<VectorType>(VectorTy), Mask, CostKind,
+                                cast<VectorType>(VectorTy), Mask, Ctx.CostKind,
                                 VF.getKnownMinValue() - 1);
 }
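[Editor's note: again an illustration, not part of the patch. The VPlanRecipes.cpp half of the change threads the kind through VPCostContext rather than the cost model itself, so each recipe's computeCost reads Ctx.CostKind instead of declaring a local TCK_RecipThroughput. A minimal sketch of that shape, continuing the earlier one; CostContext and WidenMulRecipe are hypothetical names that only mirror VPCostContext and a recipe's computeCost(VF, Ctx):]

#include <cstdio>

enum class CostKind { RecipThroughput, Latency, CodeSize, SizeAndLatency };

// Hypothetical miniature of VPCostContext: built once per costing pass and
// handed to every recipe, carrying the kind the cost model selected
// (CM.CostKind in the patch).
struct CostContext {
  CostKind Kind;
  explicit CostContext(CostKind K) : Kind(K) {}
};

// Hypothetical miniature of a recipe's computeCost(VF, Ctx): no local
// "Kind = RecipThroughput" any more; the context is authoritative, so all
// recipes in one costing pass agree by construction.
struct WidenMulRecipe {
  int computeCost(int VF, const CostContext &Ctx) const {
    int PerLane = Ctx.Kind == CostKind::CodeSize ? 1 : 4; // toy numbers
    return PerLane + (VF > 4 ? 1 : 0); // pretend wide VFs need a split
  }
};

int main() {
  CostContext Ctx(CostKind::CodeSize); // plays the role of CM.CostKind
  std::printf("cost = %d\n", WidenMulRecipe().computeCost(4, Ctx));
}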