@@ -1090,7 +1090,7 @@ class LoopVectorizationCostModel {
  /// Set up cost-based decisions for the user-requested vectorization factor
  /// \p UserVF: populate the uniform/scalar sets and the scalarization
  /// candidates for this VF, then report whether \p UserVF is feasible.
  /// \returns true iff the expected loop cost at \p UserVF is a valid cost
  /// (an invalid cost means at least one instruction cannot be costed/
  /// vectorized at this VF).
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    // These two must run before expectedCost: cost computation reads the
    // per-VF uniform/scalar and scalarization-profitability state they build.
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).isValid();
  }
10951095
10961096 // / \return The size (in bits) of the smallest and widest types in the code
@@ -1591,20 +1591,13 @@ class LoopVectorizationCostModel {
15911591 Scalars.clear ();
15921592 }
15931593
1594- // / The vectorization cost is a combination of the cost itself and a boolean
1595- // / indicating whether any of the contributing operations will actually
1596- // / operate on vector values after type legalization in the backend. If this
1597- // / latter value is false, then all operations will be scalarized (i.e. no
1598- // / vectorization has actually taken place).
1599- using VectorizationCostTy = std::pair<InstructionCost, bool >;
1600-
16011594 // / Returns the expected execution cost. The unit of the cost does
16021595 // / not matter because we use the 'cost' units to compare different
16031596 // / vector widths. The cost that is returned is *not* normalized by
16041597 // / the factor width. If \p Invalid is not nullptr, this function
16051598 // / will add a pair(Instruction*, ElementCount) to \p Invalid for
16061599 // / each instruction that has an Invalid cost for the given VF.
1607- VectorizationCostTy
1600+ InstructionCost
16081601 expectedCost (ElementCount VF,
16091602 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr );
16101603
@@ -1642,12 +1635,7 @@ class LoopVectorizationCostModel {
16421635
16431636 // / Returns the execution time cost of an instruction for a given vector
16441637 // / width. Vector width of one means scalar.
1645- VectorizationCostTy getInstructionCost (Instruction *I, ElementCount VF);
1646-
1647- // / The cost-computation logic from getInstructionCost which provides
1648- // / the vector type as an output parameter.
1649- InstructionCost getInstructionCost (Instruction *I, ElementCount VF,
1650- Type *&VectorTy);
1638+ InstructionCost getInstructionCost (Instruction *I, ElementCount VF);
16511639
16521640 // / Return the cost of instructions in an inloop reduction pattern, if I is
16531641 // / part of that pattern.
@@ -4795,9 +4783,101 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
47954783 } while (!Tail.empty ());
47964784}
47974785
/// Check if any recipe of \p Plan will generate a vector value, which will be
/// assigned a vector register.
static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
                                const TargetTransformInfo &TTI) {
  assert(VF.isVector() && "Checking a scalar VF?");
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
                          Plan.getCanonicalIV()->getScalarType()->getContext());
  // Set of already visited types; each distinct scalar type is queried against
  // TTI at most once.
  DenseSet<Type *> Visited;
  // Walk only the blocks of the vector loop region (shallow traversal, so
  // nested replicate regions' interiors are not visited here).
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      // Continue early if the recipe is considered to not produce a vector
      // result. Note that this includes VPInstruction where some opcodes may
      // produce a vector, to preserve existing behavior as VPInstructions model
      // aspects not directly mapped to existing IR instructions.
      switch (R.getVPDefID()) {
      case VPDef::VPDerivedIVSC:
      case VPDef::VPScalarIVStepsSC:
      case VPDef::VPScalarCastSC:
      case VPDef::VPReplicateSC:
      case VPDef::VPInstructionSC:
      case VPDef::VPCanonicalIVPHISC:
      case VPDef::VPVectorPointerSC:
      case VPDef::VPExpandSCEVSC:
      case VPDef::VPEVLBasedIVPHISC:
      case VPDef::VPPredInstPHISC:
      case VPDef::VPBranchOnMaskSC:
        continue;
      case VPDef::VPReductionSC:
      case VPDef::VPActiveLaneMaskPHISC:
      case VPDef::VPWidenCallSC:
      case VPDef::VPWidenCanonicalIVSC:
      case VPDef::VPWidenCastSC:
      case VPDef::VPWidenGEPSC:
      case VPDef::VPWidenSC:
      case VPDef::VPWidenSelectSC:
      case VPDef::VPBlendSC:
      case VPDef::VPFirstOrderRecurrencePHISC:
      case VPDef::VPWidenPHISC:
      case VPDef::VPWidenIntOrFpInductionSC:
      case VPDef::VPWidenPointerInductionSC:
      case VPDef::VPReductionPHISC:
      case VPDef::VPInterleaveSC:
      case VPDef::VPWidenLoadEVLSC:
      case VPDef::VPWidenLoadSC:
      case VPDef::VPWidenStoreEVLSC:
      case VPDef::VPWidenStoreSC:
        break;
      default:
        // The switch is intentionally exhaustive so that newly added recipe
        // kinds must be classified here explicitly.
        llvm_unreachable("unhandled recipe");
      }

      // Returns true if \p ScalarTy widened to VF is expected to occupy
      // vector register(s) after type legalization (rather than being split
      // back into VF scalar parts).
      auto WillWiden = [&TTI, VF](Type *ScalarTy) {
        Type *VectorTy = ToVectorTy(ScalarTy, VF);
        unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
        // getNumberOfParts returning 0 means the type cannot be legalized.
        if (!NumLegalParts)
          return false;
        if (VF.isScalable()) {
          // <vscale x 1 x iN> is assumed to be profitable over iN because
          // scalable registers are a distinct register class from scalar
          // ones. If we ever find a target which wants to lower scalable
          // vectors back to scalars, we'll need to update this code to
          // explicitly ask TTI about the register class uses for each part.
          return NumLegalParts <= VF.getKnownMinValue();
        }
        // Two or more parts that share a register - are vectorized.
        return NumLegalParts < VF.getKnownMinValue();
      };

      // If no def nor is a store, e.g., branches, continue - no value to check.
      if (R.getNumDefinedValues() == 0 &&
          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
              &R))
        continue;
      // For multi-def recipes, currently only interleaved loads, suffice to
      // check first def only.
      // For stores check their stored value; for interleaved stores suffice
      // the check first stored value only. In all cases this is the second
      // operand.
      VPValue *ToCheck =
          R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
      Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
      if (!Visited.insert({ScalarTy}).second)
        continue;
      if (WillWiden(ScalarTy))
        return true;
    }
  }

  // No recipe produced a type that legalizes to vector registers.
  return false;
}
4878+
47984879VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor () {
4799- InstructionCost ExpectedCost =
4800- CM.expectedCost (ElementCount::getFixed (1 )).first ;
4880+ InstructionCost ExpectedCost = CM.expectedCost (ElementCount::getFixed (1 ));
48014881 LLVM_DEBUG (dbgs () << " LV: Scalar loop costs: " << ExpectedCost << " .\n " );
48024882 assert (ExpectedCost.isValid () && " Unexpected invalid cost for scalar loop" );
48034883 assert (any_of (VPlans,
@@ -4826,9 +4906,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
48264906 if (VF.isScalar ())
48274907 continue ;
48284908
4829- LoopVectorizationCostModel::VectorizationCostTy C =
4830- CM.expectedCost (VF, &InvalidCosts);
4831- VectorizationFactor Candidate (VF, C.first , ScalarCost.ScalarCost );
4909+ InstructionCost C = CM.expectedCost (VF, &InvalidCosts);
4910+ VectorizationFactor Candidate (VF, C, ScalarCost.ScalarCost );
48324911
48334912#ifndef NDEBUG
48344913 unsigned AssumedMinimumVscale =
@@ -4845,7 +4924,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
48454924 LLVM_DEBUG (dbgs () << " .\n " );
48464925#endif
48474926
4848- if (!C. second && !ForceVectorization ) {
4927+ if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI) ) {
48494928 LLVM_DEBUG (
48504929 dbgs ()
48514930 << " LV: Not considering vector loop of width " << VF
@@ -5146,7 +5225,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
51465225 // If we did not calculate the cost for VF (because the user selected the VF)
51475226 // then we calculate the cost of VF here.
51485227 if (LoopCost == 0 ) {
5149- LoopCost = expectedCost (VF). first ;
5228+ LoopCost = expectedCost (VF);
51505229 assert (LoopCost.isValid () && " Expected to have chosen a VF with valid cost" );
51515230
51525231 // Loop body is free and there is no need for interleaving.
@@ -5717,15 +5796,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
57175796
57185797 // Compute the cost of the vector instruction. Note that this cost already
57195798 // includes the scalarization overhead of the predicated instruction.
5720- InstructionCost VectorCost = getInstructionCost (I, VF). first ;
5799+ InstructionCost VectorCost = getInstructionCost (I, VF);
57215800
57225801 // Compute the cost of the scalarized instruction. This cost is the cost of
57235802 // the instruction as if it wasn't if-converted and instead remained in the
57245803 // predicated block. We will scale this cost by block probability after
57255804 // computing the scalarization overhead.
57265805 InstructionCost ScalarCost =
5727- VF.getFixedValue () *
5728- getInstructionCost (I, ElementCount::getFixed (1 )).first ;
5806+ VF.getFixedValue () * getInstructionCost (I, ElementCount::getFixed (1 ));
57295807
57305808 // Compute the scalarization overhead of needed insertelement instructions
57315809 // and phi nodes.
@@ -5769,14 +5847,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
57695847 return Discount;
57705848}
57715849
5772- LoopVectorizationCostModel::VectorizationCostTy
5773- LoopVectorizationCostModel::expectedCost (
5850+ InstructionCost LoopVectorizationCostModel::expectedCost (
57745851 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5775- VectorizationCostTy Cost;
5852+ InstructionCost Cost;
57765853
57775854 // For each block.
57785855 for (BasicBlock *BB : TheLoop->blocks ()) {
5779- VectorizationCostTy BlockCost;
5856+ InstructionCost BlockCost;
57805857
57815858 // For each instruction in the old loop.
57825859 for (Instruction &I : BB->instructionsWithoutDebug ()) {
@@ -5785,22 +5862,19 @@ LoopVectorizationCostModel::expectedCost(
57855862 (VF.isVector () && VecValuesToIgnore.count (&I)))
57865863 continue ;
57875864
5788- VectorizationCostTy C = getInstructionCost (&I, VF);
5865+ InstructionCost C = getInstructionCost (&I, VF);
57895866
57905867 // Check if we should override the cost.
5791- if (C.first .isValid () &&
5792- ForceTargetInstructionCost.getNumOccurrences () > 0 )
5793- C.first = InstructionCost (ForceTargetInstructionCost);
5868+ if (C.isValid () && ForceTargetInstructionCost.getNumOccurrences () > 0 )
5869+ C = InstructionCost (ForceTargetInstructionCost);
57945870
57955871 // Keep a list of instructions with invalid costs.
5796- if (Invalid && !C.first . isValid ())
5872+ if (Invalid && !C.isValid ())
57975873 Invalid->emplace_back (&I, VF);
57985874
5799- BlockCost.first += C.first ;
5800- BlockCost.second |= C.second ;
5801- LLVM_DEBUG (dbgs () << " LV: Found an estimated cost of " << C.first
5802- << " for VF " << VF << " For instruction: " << I
5803- << ' \n ' );
5875+ BlockCost += C;
5876+ LLVM_DEBUG (dbgs () << " LV: Found an estimated cost of " << C << " for VF "
5877+ << VF << " For instruction: " << I << ' \n ' );
58045878 }
58055879
58065880 // If we are vectorizing a predicated block, it will have been
@@ -5811,10 +5885,9 @@ LoopVectorizationCostModel::expectedCost(
58115885 // cost by the probability of executing it. blockNeedsPredication from
58125886 // Legal is used so as to not include all blocks in tail folded loops.
58135887 if (VF.isScalar () && Legal->blockNeedsPredication (BB))
5814- BlockCost. first /= getReciprocalPredBlockProb ();
5888+ BlockCost /= getReciprocalPredBlockProb ();
58155889
5816- Cost.first += BlockCost.first ;
5817- Cost.second |= BlockCost.second ;
5890+ Cost += BlockCost;
58185891 }
58195892
58205893 return Cost;
@@ -6213,49 +6286,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
62136286 return getWideningCost (I, VF);
62146287}
62156288
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // Returns the instruction's cost paired with a flag indicating whether the
  // instruction's type actually legalizes to vector values (pair.second);
  // see VectorizationCostTy.
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(1);

  // Instructions that are cheaper scalarized keep their precomputed
  // scalarization cost; they contribute no vector values (false).
  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(I))
      // Cost is the scalar (VF=1) cost replicated once per lane.
      return VectorizationCostTy(
          (getInstructionCost(I, ElementCount::getFixed(1)).first *
           VF.getKnownMinValue()),
          false);
  }

  // Delegate to the overload that also reports the vector type used for
  // costing, so we can ask TTI how the type legalizes.
  Type *VectorTy;
  InstructionCost C = getInstructionCost(I, VF, VectorTy);

  bool TypeNotScalarized = false;
  if (VF.isVector() && VectorTy->isVectorTy()) {
    if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
      if (VF.isScalable())
        // <vscale x 1 x iN> is assumed to be profitable over iN because
        // scalable registers are a distinct register class from scalar ones.
        // If we ever find a target which wants to lower scalable vectors
        // back to scalars, we'll need to update this code to explicitly
        // ask TTI about the register class uses for each part.
        TypeNotScalarized = NumParts <= VF.getKnownMinValue();
      else
        TypeNotScalarized = NumParts < VF.getKnownMinValue();
    } else
      // Zero parts: the type cannot be legalized at all - invalid cost.
      C = InstructionCost::getInvalid();
  }
  return VectorizationCostTy(C, TypeNotScalarized);
}
6258-
62596289InstructionCost LoopVectorizationCostModel::getScalarizationOverhead (
62606290 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
62616291
@@ -6646,8 +6676,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
66466676}
66476677
66486678InstructionCost
6649- LoopVectorizationCostModel::getInstructionCost (Instruction *I, ElementCount VF,
6650- Type *&VectorTy) {
6679+ LoopVectorizationCostModel::getInstructionCost (Instruction *I,
6680+ ElementCount VF) {
6681+ // If we know that this instruction will remain uniform, check the cost of
6682+ // the scalar version.
6683+ if (isUniformAfterVectorization (I, VF))
6684+ VF = ElementCount::getFixed (1 );
6685+
6686+ if (VF.isVector () && isProfitableToScalarize (I, VF))
6687+ return InstsToScalarize[VF][I];
6688+
6689+ // Forced scalars do not have any scalarization overhead.
6690+ auto ForcedScalar = ForcedScalars.find (VF);
6691+ if (VF.isVector () && ForcedScalar != ForcedScalars.end ()) {
6692+ auto InstSet = ForcedScalar->second ;
6693+ if (InstSet.count (I))
6694+ return getInstructionCost (I, ElementCount::getFixed (1 )) *
6695+ VF.getKnownMinValue ();
6696+ }
6697+
66516698 Type *RetTy = I->getType ();
66526699 if (canTruncateToMinimalBitwidth (I, VF))
66536700 RetTy = IntegerType::get (RetTy->getContext (), MinBWs[I]);
@@ -6670,6 +6717,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
66706717 };
66716718 (void ) hasSingleCopyAfterVectorization;
66726719
6720+ Type *VectorTy;
66736721 if (isScalarAfterVectorization (I, VF)) {
66746722 // With the exception of GEPs and PHIs, after scalarization there should
66756723 // only be one copy of the instruction generated in the loop. This is
@@ -6685,6 +6733,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
66856733 } else
66866734 VectorTy = ToVectorTy (RetTy, VF);
66876735
6736+ if (VF.isVector () && VectorTy->isVectorTy () &&
6737+ !TTI.getNumberOfParts (VectorTy))
6738+ return InstructionCost::getInvalid ();
6739+
66886740 // TODO: We need to estimate the cost of intrinsic calls.
66896741 switch (I->getOpcode ()) {
66906742 case Instruction::GetElementPtr:
0 commit comments