@@ -1505,6 +1505,11 @@ class LoopVectorizationCostModel {
15051505 ElementCount UserVF,
15061506 bool FoldTailByMasking);
15071507
1508+ /// If \p VF > \p MaxTripCount, clamps it to the next lower VF that is <=
1509+ /// \p MaxTripCount.
1510+ ElementCount clampVFByMaxTripCount (ElementCount VF, unsigned MaxTripCount,
1511+ bool FoldTailByMasking) const ;
1512+
15081513 // / \return the maximized element count based on the targets vector
15091514 // / registers and the loop trip-count, but limited to a maximum safe VF.
15101515 // / This is a helper function of computeFeasibleMaxVF.
@@ -3854,6 +3859,38 @@ bool LoopVectorizationCostModel::useMaxBandwidth(
38543859 Legal->hasVectorCallVariants ())));
38553860}
38563861
// Clamps \p VF against \p MaxTripCount. Returns a power-of-two element count
// not exceeding the (possibly adjusted) trip count when the trip count is
// non-zero, no larger than the estimated lane count of \p VF and, if
// \p FoldTailByMasking is set, itself a power of two. Otherwise returns \p VF
// unchanged.
3862+ ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount (
3863+ ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
// Estimate the lane count: the known minimum of VF, multiplied by the minimum
// vscale from the function's vscale_range attribute when VF is scalable and
// the attribute is present.
3864+ unsigned EstimatedVF = VF.getKnownMinValue ();
3865+ if (VF.isScalable () && TheFunction->hasFnAttribute (Attribute::VScaleRange)) {
3866+ auto Attr = TheFunction->getFnAttribute (Attribute::VScaleRange);
3867+ auto Min = Attr.getVScaleRangeMin ();
3868+ EstimatedVF *= Min;
3869+ }
3870+
3871+ // When a scalar epilogue is required, at least one iteration of the scalar
3872+ // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3873+ // max VF that results in a dead vector loop. MaxTripCount is a by-value
// parameter, so this adjustment is local to the current call.
3874+ if (MaxTripCount > 0 && requiresScalarEpilogue (true ))
3875+ MaxTripCount -= 1 ;
3876+
3877+ if (MaxTripCount && MaxTripCount <= EstimatedVF &&
3878+ (!FoldTailByMasking || isPowerOf2_32 (MaxTripCount))) {
3879+ // If the upper bound of the loop trip count (TC) is known at compile time
3880+ // there is no point in choosing a VF greater than TC. Select the maximum
3881+ // power of two which doesn't exceed TC. This only applies when TC is less
3882+ // than or equal to the estimated number of lanes. The scalable flag of VF
3883+ // is kept only when folding the tail by masking; otherwise it is false.
3884+ auto ClampedUpperTripCount = llvm::bit_floor (MaxTripCount);
3885+ LLVM_DEBUG (dbgs () << " LV: Clamping the MaxVF to maximum power of two not "
3886+ " exceeding the constant trip count: "
3887+ << ClampedUpperTripCount << " \n " );
3888+ return ElementCount::get (ClampedUpperTripCount,
3889+ FoldTailByMasking ? VF.isScalable () : false );
3890+ }
// No clamping applies: hand back the caller's VF untouched.
3891+ return VF;
3892+ }
3893+
38573894ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget (
38583895 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
38593896 ElementCount MaxSafeVF, bool FoldTailByMasking) {
@@ -3885,40 +3922,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
38853922 return ElementCount::getFixed (1 );
38863923 }
38873924
3888- unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue ();
3889- if (MaxVectorElementCount.isScalable () &&
3890- TheFunction->hasFnAttribute (Attribute::VScaleRange)) {
3891- auto Attr = TheFunction->getFnAttribute (Attribute::VScaleRange);
3892- auto Min = Attr.getVScaleRangeMin ();
3893- WidestRegisterMinEC *= Min;
3894- }
3895-
3896- // When a scalar epilogue is required, at least one iteration of the scalar
3897- // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3898- // max VF that results in a dead vector loop.
3899- if (MaxTripCount > 0 && requiresScalarEpilogue (true ))
3900- MaxTripCount -= 1 ;
3901-
3902- if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
3903- (!FoldTailByMasking || isPowerOf2_32 (MaxTripCount))) {
3904- // If upper bound loop trip count (TC) is known at compile time there is no
3905- // point in choosing VF greater than TC (as done in the loop below). Select
3906- // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
3907- // scalable, we only fall back on a fixed VF when the TC is less than or
3908- // equal to the known number of lanes.
3909- auto ClampedUpperTripCount = llvm::bit_floor (MaxTripCount);
3910- LLVM_DEBUG (dbgs () << " LV: Clamping the MaxVF to maximum power of two not "
3911- " exceeding the constant trip count: "
3912- << ClampedUpperTripCount << " \n " );
3913- return ElementCount::get (
3914- ClampedUpperTripCount,
3915- FoldTailByMasking ? MaxVectorElementCount.isScalable () : false );
3916- }
3925+ ElementCount MaxVF = clampVFByMaxTripCount (MaxVectorElementCount,
3926+ MaxTripCount, FoldTailByMasking);
3927+ if (MaxVF != MaxVectorElementCount)
3928+ return MaxVF;
39173929
39183930 TargetTransformInfo::RegisterKind RegKind =
39193931 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
39203932 : TargetTransformInfo::RGK_FixedWidthVector;
3921- ElementCount MaxVF = MaxVectorElementCount;
39223933
39233934 if (MaxVF.isScalable ())
39243935 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
@@ -3940,6 +3951,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
39403951 }
39413952 }
39423953
3954+ MaxVF = clampVFByMaxTripCount (MaxVF, MaxTripCount, FoldTailByMasking);
3955+
39433956 // Invalidate any widening decisions we might have made, in case the loop
39443957 // requires prediction (decided later), but we have already made some
39453958 // load/store widening decisions.
0 commit comments