@@ -1095,6 +1095,7 @@ class LoopVectorizationCostModel {
     CM_Widen_Reverse, // For consecutive accesses with stride -1.
     CM_Interleave,
     CM_GatherScatter,
+    CM_Compressed,
     CM_Scalarize,
     CM_VectorCall,
     CM_IntrinsicCall
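For context (not part of the patch): CM_Compressed is a widening decision for memory accesses indexed by a conditionally advancing counter. A minimal C++ sketch of the assumed target idiom, with illustrative names:

// Sketch of the assumed scalar idiom behind CM_Compressed: `J` is a
// "monotonic" phi that advances only when the predicate holds, so the
// store packs selected elements contiguously (a compressing store).
void compressLoop(int *Out, const int *In, const bool *Pred, int N) {
  int J = 0;
  for (int I = 0; I < N; ++I)
    if (Pred[I])
      Out[J++] = In[I]; // consecutive in Out, but only on active iterations
}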
@@ -1308,9 +1309,9 @@ class LoopVectorizationCostModel {
   getDivRemSpeculationCost(Instruction *I,
                            ElementCount VF) const;

-  /// Returns widening decision (CM_Widen or CM_Widen_Reverse) if \p I is a
-  /// memory instruction with consecutive access that can be widened, or
-  /// CM_Unknown otherwise.
+  /// Returns widening decision (CM_Widen, CM_Widen_Reverse or CM_Compressed)
+  /// if \p I is a memory instruction with consecutive access that can be
+  /// widened, or CM_Unknown otherwise.
   InstWidening memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);

   /// Returns true if \p I is a memory instruction in an interleaved-group
@@ -3263,6 +3264,9 @@ LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
   auto *Ptr = getLoadStorePointerOperand(I);
   auto *ScalarTy = getLoadStoreType(I);

+  if (Legal->isCompressedPtr(ScalarTy, Ptr, I->getParent()))
+    return CM_Compressed;
+
   // In order to be widened, the pointer should be consecutive, first of all.
   auto Stride = Legal->isConsecutivePtr(ScalarTy, Ptr);
   if (!Stride)
@@ -3372,9 +3376,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     if (IsUniformMemOpUse(I))
      return true;

-    return (WideningDecision == CM_Widen ||
-            WideningDecision == CM_Widen_Reverse ||
-            WideningDecision == CM_Interleave);
+    return (
+        WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+        WideningDecision == CM_Interleave || WideningDecision == CM_Compressed);
   };

   // Returns true if Ptr is the pointer operand of a memory access instruction
@@ -3514,6 +3518,39 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     AddToWorklistIfAllowed(IndUpdate);
   }

+  // Handle monotonic phis (similarly to induction vars).
+  for (const auto &MonotonicPHI : Legal->getMonotonicPHIs()) {
+    auto *Phi = MonotonicPHI.first;
+    auto *PhiUpdate = cast<Instruction>(Phi->getIncomingValueForBlock(Latch));
+    const auto &Desc = MonotonicPHI.second;
+
+    auto UniformPhi = llvm::all_of(Phi->users(), [&](User *U) -> bool {
+      auto *I = cast<Instruction>(U);
+      if (I == Desc.getStepInst())
+        return true;
+      if (auto *PN = dyn_cast<PHINode>(I); PN && Desc.getChain().contains(PN))
+        return true;
+      return !TheLoop->contains(I) || Worklist.count(I) ||
+             IsVectorizedMemAccessUse(I, Phi);
+    });
+    if (!UniformPhi)
+      continue;
+
+    auto UniformPhiUpdate =
+        llvm::all_of(PhiUpdate->users(), [&](User *U) -> bool {
+          auto *I = cast<Instruction>(U);
+          if (I == Phi)
+            return true;
+          return !TheLoop->contains(I) || Worklist.count(I) ||
+                 IsVectorizedMemAccessUse(I, Phi);
+        });
+    if (!UniformPhiUpdate)
+      continue;
+
+    AddToWorklistIfAllowed(Phi);
+    AddToWorklistIfAllowed(PhiUpdate);
+  }
+
   Uniforms[VF].insert_range(Worklist);
 }

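Why the phi itself can be uniform even though lanes write to different slots (a hedged model; names below are illustrative, not from the patch): each lane derives its slot from the shared scalar value plus a prefix count of the mask, so no per-lane phi is required.

#include <bitset>
#include <cstddef>

// Lane `Lane` of a compressing store writes at J plus the number of
// active lanes preceding it (a prefix popcount of the mask); only the
// scalar J has to flow through the phi.
template <std::size_t N>
std::size_t laneStoreIndex(std::size_t J, const std::bitset<N> &Mask,
                           std::size_t Lane) {
  std::size_t Before = 0;
  for (std::size_t L = 0; L < Lane; ++L)
    Before += Mask[L];
  return J + Before;
}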
@@ -4272,6 +4309,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
     case VPDef::VPEVLBasedIVPHISC:
     case VPDef::VPPredInstPHISC:
     case VPDef::VPBranchOnMaskSC:
+    case VPDef::VPMonotonicPHISC:
       continue;
     case VPDef::VPReductionSC:
     case VPDef::VPActiveLaneMaskPHISC:
@@ -4992,6 +5030,10 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   if (Legal->hasUncountableEarlyExit())
     return 1;

+  // Monotonic vars don't support interleaving.
+  if (Legal->hasMonotonicPHIs())
+    return 1;
+
   const bool HasReductions = !Legal->getReductionVars().empty();

   // If we did not calculate the cost for VF (because the user selected the VF)
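The diff does not spell out why interleaving is excluded; a plausible reading is that each unrolled part's monotonic value would depend on the mask counts of all earlier parts, serializing the parts. A comment-level C++ sketch of this assumed reasoning:

// Hypothetical IC = 2 expansion of a monotonic phi J with step Step:
//   J0 = J;
//   J1 = J0 + Step * popcount(Mask0); // part 1 must wait for part 0's mask
//   J  = J1 + Step * popcount(Mask1); // backedge value
// The cross-part dependence chain defeats the purpose of interleaving,
// so the cost model simply forces IC = 1 for such loops.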
@@ -5577,12 +5619,17 @@ InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost(
     Instruction *I, ElementCount VF, InstWidening Decision) {
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  const Align Alignment = getLoadStoreAlignment(I);
   unsigned AS = getLoadStoreAddressSpace(I);
   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

+  if (Decision == CM_Compressed)
+    return TTI.getExpandCompressMemoryOpCost(I->getOpcode(), VectorTy,
+                                             /*VariableMask*/ true, Alignment,
+                                             CostKind, I);
+
   assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) &&
          "Expected widen decision.");
-  const Align Alignment = getLoadStoreAlignment(I);
   InstructionCost Cost = 0;
   if (Legal->isMaskRequired(I)) {
     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
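getExpandCompressMemoryOpCost is the TTI hook used to price llvm.masked.expandload / llvm.masked.compressstore. For reference, a hedged C++ model of the store-side semantics being priced here (a model only, not the code the vectorizer emits):

#include <bitset>
#include <cstddef>

// Active lanes of Val are packed and stored contiguously starting at Ptr,
// which ends up advanced by popcount(Mask) -- the semantics of
// llvm.masked.compressstore.
template <std::size_t N>
int *modelCompressStore(int *Ptr, const int (&Val)[N],
                        const std::bitset<N> &Mask) {
  for (std::size_t L = 0; L < N; ++L)
    if (Mask[L])
      *Ptr++ = Val[L];
  return Ptr;
}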
@@ -6292,6 +6339,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   // the scalar version.
   if (isUniformAfterVectorization(I, VF))
     VF = ElementCount::getFixed(1);
+  else if (auto *Phi = dyn_cast<PHINode>(I)) {
+    // Prohibit scalarization of monotonic phis.
+    if (Legal->isMonotonicPHI(Phi))
+      return InstructionCost::getInvalid();
+  }

   if (VF.isVector() && isProfitableToScalarize(I, VF))
     return InstsToScalarize[VF][I];
@@ -6647,6 +6699,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     switch (getWideningDecision(I, VF)) {
     case LoopVectorizationCostModel::CM_GatherScatter:
       return TTI::CastContextHint::GatherScatter;
+    case LoopVectorizationCostModel::CM_Compressed:
+      return TTI::CastContextHint::Compressed;
     case LoopVectorizationCostModel::CM_Interleave:
       return TTI::CastContextHint::Interleave;
     case LoopVectorizationCostModel::CM_Scalarize:
@@ -7238,6 +7292,16 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     }
   }

+  for (const auto &[MonotonicPhi, MonotonicDesc] : Legal->getMonotonicPHIs()) {
+    // TODO: Currently we restrict vectorization of non-uniform monotonic phis
+    // by reporting an Invalid cost for them. This can be relaxed in the
+    // future.
+    if (VF.isVector() && !CM.isUniformAfterVectorization(MonotonicPhi, VF))
+      Cost = InstructionCost::getInvalid();
+    else
+      Cost += TTI.getCFInstrCost(Instruction::PHI, CostCtx.CostKind);
+    CostCtx.SkipCostComputation.insert(MonotonicPhi);
+  }
+
   // Pre-compute the costs for branches except for the backedge, as the number
   // of replicate regions in a VPlan may not directly match the number of
   // branches, which would lead to different decisions.
@@ -8229,8 +8293,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   LoopVectorizationCostModel::InstWidening Decision =
       CM.getWideningDecision(I, Range.Start);
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
+  bool Compressed = Decision == LoopVectorizationCostModel::CM_Compressed;
   bool Consecutive =
-      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+      Reverse || Compressed || Decision == LoopVectorizationCostModel::CM_Widen;

   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
   if (Consecutive) {
@@ -8258,11 +8323,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
-                                 VPIRMetadata(*Load, LVer), I->getDebugLoc());
+                                 Compressed, VPIRMetadata(*Load, LVer),
+                                 I->getDebugLoc());

   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
-                                Reverse, VPIRMetadata(*Store, LVer),
+                                Reverse, Compressed, VPIRMetadata(*Store, LVer),
                                 I->getDebugLoc());
 }

@@ -8771,11 +8837,19 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
     return Recipe;

   VPHeaderPHIRecipe *PhiRecipe = nullptr;
-  assert((Legal->isReductionVariable(Phi) ||
+  assert((Legal->isMonotonicPHI(Phi) || Legal->isReductionVariable(Phi) ||
           Legal->isFixedOrderRecurrence(Phi)) &&
-         "can only widen reductions and fixed-order recurrences here");
+         "can only widen monotonic phis, reductions and fixed-order "
+         "recurrences here");
   VPValue *StartV = Operands[0];
-  if (Legal->isReductionVariable(Phi)) {
+  Value *IncomingVal =
+      Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader());
+  if (Legal->isMonotonicPHI(Phi)) {
+    const MonotonicDescriptor &Desc =
+        Legal->getMonotonicPHIs().find(Phi)->second;
+    assert(Desc.getExpr()->getStart() == PSE.getSCEV(IncomingVal));
+    PhiRecipe = new VPMonotonicPHIRecipe(Phi, Desc, StartV);
+  } else if (Legal->isReductionVariable(Phi)) {
     const RecurrenceDescriptor &RdxDesc =
         Legal->getReductionVars().find(Phi)->second;
     assert(RdxDesc.getRecurrenceStartValue() ==
@@ -9397,6 +9471,27 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
   // bring the VPlan to its final state.
   // ---------------------------------------------------------------------------

+  // Adjust the recipes for any monotonic phis.
+  for (VPRecipeBase &R : HeaderVPBB->phis()) {
+    auto *MonotonicPhi = dyn_cast<VPMonotonicPHIRecipe>(&R);
+    if (!MonotonicPhi)
+      continue;
+
+    auto &Desc = MonotonicPhi->getDescriptor();
+    auto [EdgeSrc, EdgeDst] = Desc.getPredicateEdge();
+    auto &SE = *PSE.getSE();
+    auto *Step = vputils::getOrCreateVPValueForSCEVExpr(
+        *Plan, Desc.getExpr()->getStepRecurrence(SE), SE);
+
+    auto *MonotonicI = new VPInstruction(
+        VPInstruction::ComputeMonotonicResult,
+        {MonotonicPhi, RecipeBuilder.getEdgeMask(EdgeSrc, EdgeDst), Step},
+        *Desc.getStepInst());
+    auto *InsertBlock = MonotonicPhi->getBackedgeRecipe().getParent();
+    InsertBlock->insert(MonotonicI, InsertBlock->getFirstNonPhi());
+    MonotonicPhi->getBackedgeValue()->replaceAllUsesWith(MonotonicI);
+  }
+
   // Adjust the recipes for any inloop reductions.
   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);

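The semantics of the new ComputeMonotonicResult opcode are not defined in this hunk. Given its operands (the phi, the mask of the predicated edge, and the step), a reasonable model of the backedge value it produces is:

#include <bitset>

// Assumed meaning of VPInstruction::ComputeMonotonicResult: advance the
// incoming monotonic value by Step once per lane on which the predicate
// edge is taken; the result feeds the phi's backedge.
unsigned computeMonotonicResult(unsigned Incoming,
                                const std::bitset<8> &EdgeMask,
                                unsigned Step) {
  return Incoming + Step * static_cast<unsigned>(EdgeMask.count());
}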
@@ -10587,6 +10682,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);

   unsigned SelectedIC = std::max(IC, UserIC);
+
+  if (LVL.hasMonotonicPHIs() && SelectedIC > 1) {
+    reportVectorizationFailure(
+        "Interleaving of loop with monotonic vars",
+        "Interleaving of loops with monotonic vars is not supported",
+        "CantInterleaveWithMonotonicVars", ORE, L);
+    return false;
+  }
+
   // Optimistically generate runtime checks if they are needed. Drop them if
   // they turn out to not be profitable.
   if (VF.Width.isVector() || SelectedIC > 1)