@@ -974,13 +974,6 @@ class LoopVectorizationCostModel {
974974 // / 64 bit loop indices.
975975 std::pair<unsigned , unsigned > getSmallestAndWidestTypes ();
976976
977- // / \return The desired interleave count.
978- // / If interleave count has been specified by metadata it will be returned.
979- // / Otherwise, the interleave count is computed and returned. VF and LoopCost
980- // / are the selected vectorization factor and the cost of the selected VF.
981- unsigned selectInterleaveCount (VPlan &Plan, ElementCount VF,
982- InstructionCost LoopCost);
983-
984977 // / Memory access instruction may be vectorized in more than one way.
985978 // / Form of instruction after vectorization depends on cost.
986979 // / This function takes cost-based decisions for Load/Store instructions
@@ -4653,8 +4646,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
46534646}
46544647
46554648unsigned
4656- LoopVectorizationCostModel ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4657- InstructionCost LoopCost) {
4649+ LoopVectorizationPlanner ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4650+ InstructionCost LoopCost) {
46584651 // -- The interleave heuristics --
46594652 // We interleave the loop in order to expose ILP and reduce the loop overhead.
46604653 // There are many micro-architectural considerations that we can't predict
@@ -4669,11 +4662,11 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
46694662 // 3. We don't interleave if we think that we will spill registers to memory
46704663 // due to the increased register pressure.
46714664
4672- if (!isScalarEpilogueAllowed ())
4665+ if (!CM. isScalarEpilogueAllowed ())
46734666 return 1 ;
46744667
4675- // Do not interleave if EVL is preferred and no User IC is specified.
4676- if ( foldTailWithEVL ( )) {
4668+ if ( any_of (Plan. getVectorLoopRegion ()-> getEntryBasicBlock ()-> phis (),
4669+ IsaPred<VPEVLBasedIVPHIRecipe> )) {
46774670 LLVM_DEBUG (dbgs () << " LV: Preference for VP intrinsics indicated. "
46784671 " Unroll factor forced to be 1.\n " );
46794672 return 1 ;
@@ -4686,15 +4679,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
46864679 // We don't attempt to perform interleaving for loops with uncountable early
46874680 // exits because the VPInstruction::AnyOf code cannot currently handle
46884681 // multiple parts.
4689- if (Legal-> hasUncountableEarlyExit ())
4682+ if (Plan. hasEarlyExit ())
46904683 return 1 ;
46914684
4692- const bool HasReductions = !Legal->getReductionVars ().empty ();
4685+ const bool HasReductions =
4686+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4687+ IsaPred<VPReductionPHIRecipe>);
46934688
46944689 // If we did not calculate the cost for VF (because the user selected the VF)
46954690 // then we calculate the cost of VF here.
46964691 if (LoopCost == 0 ) {
4697- LoopCost = expectedCost (VF);
4692+ if (VF.isScalar ())
4693+ LoopCost = CM.expectedCost (VF);
4694+ else
4695+ LoopCost = cost (Plan, VF);
46984696 assert (LoopCost.isValid () && " Expected to have chosen a VF with valid cost" );
46994697
47004698 // Loop body is free and there is no need for interleaving.
@@ -4703,7 +4701,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
47034701 }
47044702
47054703 VPRegisterUsage R =
4706- calculateRegisterUsageForPlan (Plan, {VF}, TTI, ValuesToIgnore)[0 ];
4704+ calculateRegisterUsageForPlan (Plan, {VF}, TTI, CM. ValuesToIgnore )[0 ];
47074705 // We divide by these constants so assume that we have at least one
47084706 // instruction that uses at least one register.
47094707 for (auto &Pair : R.MaxLocalUsers ) {
@@ -4766,23 +4764,24 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
47664764
47674765 // Try to get the exact trip count, or an estimate based on profiling data or
47684766 // ConstantMax from PSE, failing that.
4769- auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop );
4767+ auto BestKnownTC = getSmallBestKnownTC (PSE, OrigLoop );
47704768
47714769 // For fixed length VFs treat a scalable trip count as unknown.
47724770 if (BestKnownTC && (BestKnownTC->isFixed () || VF.isScalable ())) {
47734771 // Re-evaluate trip counts and VFs to be in the same numerical space.
4774- unsigned AvailableTC = estimateElementCount (*BestKnownTC, VScaleForTuning);
4775- unsigned EstimatedVF = estimateElementCount (VF, VScaleForTuning);
4772+ unsigned AvailableTC =
4773+ estimateElementCount (*BestKnownTC, CM.getVScaleForTuning ());
4774+ unsigned EstimatedVF = estimateElementCount (VF, CM.getVScaleForTuning ());
47764775
47774776 // At least one iteration must be scalar when this constraint holds. So the
47784777 // maximum available iterations for interleaving is one less.
4779- if (requiresScalarEpilogue (VF.isVector ()))
4778+ if (CM. requiresScalarEpilogue (VF.isVector ()))
47804779 --AvailableTC;
47814780
47824781 unsigned InterleaveCountLB = bit_floor (std::max (
47834782 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
47844783
4785- if (getSmallConstantTripCount (PSE.getSE (), TheLoop ).isNonZero ()) {
4784+ if (getSmallConstantTripCount (PSE.getSE (), OrigLoop ).isNonZero ()) {
47864785 // If the best known trip count is exact, we select between two
47874786 // prospective ICs, where
47884787 //
@@ -4843,7 +4842,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48434842 // vectorized the loop we will have done the runtime check and so interleaving
48444843 // won't require further checks.
48454844 bool ScalarInterleavingRequiresPredication =
4846- (VF.isScalar () && any_of (TheLoop ->blocks (), [this ](BasicBlock *BB) {
4845+ (VF.isScalar () && any_of (OrigLoop ->blocks (), [this ](BasicBlock *BB) {
48474846 return Legal->blockNeedsPredication (BB);
48484847 }));
48494848 bool ScalarInterleavingRequiresRuntimePointerCheck =
@@ -4866,8 +4865,39 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48664865
48674866 // Interleave until store/load ports (estimated by max interleave count) are
48684867 // saturated.
4869- unsigned NumStores = Legal->getNumStores ();
4870- unsigned NumLoads = Legal->getNumLoads ();
4868+ unsigned NumStores = 0 ;
4869+ unsigned NumLoads = 0 ;
4870+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4871+ vp_depth_first_deep (Plan.getVectorLoopRegion ()->getEntry ()))) {
4872+ for (VPRecipeBase &R : *VPBB) {
4873+ if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(&R)) {
4874+ NumLoads++;
4875+ continue ;
4876+ }
4877+ if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R)) {
4878+ NumStores++;
4879+ continue ;
4880+ }
4881+
4882+ if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4883+ if (unsigned StoreOps = InterleaveR->getNumStoreOperands ())
4884+ NumStores += StoreOps;
4885+ else
4886+ NumLoads += InterleaveR->getNumDefinedValues ();
4887+ continue ;
4888+ }
4889+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4890+ NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr ());
4891+ NumStores += isa<StoreInst>(RepR->getUnderlyingInstr ());
4892+ continue ;
4893+ }
4894+ if (isa<VPHistogramRecipe>(&R)) {
4895+ NumLoads++;
4896+ NumStores++;
4897+ continue ;
4898+ }
4899+ }
4900+ }
48714901 unsigned StoresIC = IC / (NumStores ? NumStores : 1 );
48724902 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1 );
48734903
@@ -4877,12 +4907,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48774907 // do the final reduction after the loop.
48784908 bool HasSelectCmpReductions =
48794909 HasReductions &&
4880- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4881- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4882- RecurKind RK = RdxDesc.getRecurrenceKind ();
4883- return RecurrenceDescriptor::isAnyOfRecurrenceKind (RK) ||
4884- RecurrenceDescriptor::isFindIVRecurrenceKind (RK);
4885- });
4910+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4911+ [](VPRecipeBase &R) {
4912+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4913+ return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind (
4914+ RedR->getRecurrenceKind ()) ||
4915+ RecurrenceDescriptor::isFindIVRecurrenceKind (
4916+ RedR->getRecurrenceKind ()));
4917+ });
48864918 if (HasSelectCmpReductions) {
48874919 LLVM_DEBUG (dbgs () << " LV: Not interleaving select-cmp reductions.\n " );
48884920 return 1 ;
@@ -4893,12 +4925,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48934925 // we're interleaving is inside another loop. For tree-wise reductions
48944926 // set the limit to 2, and for ordered reductions it's best to disable
48954927 // interleaving entirely.
4896- if (HasReductions && TheLoop ->getLoopDepth () > 1 ) {
4928+ if (HasReductions && OrigLoop ->getLoopDepth () > 1 ) {
48974929 bool HasOrderedReductions =
4898- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4899- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4900- return RdxDesc.isOrdered ();
4901- });
4930+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4931+ [](VPRecipeBase &R) {
4932+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4933+
4934+ return RedR && RedR->isOrdered ();
4935+ });
49024936 if (HasOrderedReductions) {
49034937 LLVM_DEBUG (
49044938 dbgs () << " LV: Not interleaving scalar ordered reductions.\n " );
@@ -10122,7 +10156,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1012210156 GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM.CostKind );
1012310157 if (LVP.hasPlanWithVF (VF.Width )) {
1012410158 // Select the interleave count.
10125- IC = CM .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
10159+ IC = LVP .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
1012610160
1012710161 unsigned SelectedIC = std::max (IC, UserIC);
1012810162 // Optimistically generate runtime checks if they are needed. Drop them if
0 commit comments