Commit 8544164

[LoopVectorize] Don't scalarize predicated instruction with optsize
Scalarizing predicated instructions results in a worse code size impact than having a scalar epilogue, which we already forbid with optsize, so we shouldn't allow it.

A couple of notes on the implementation:

* OptForSizeBasedOnProfile has been moved into the cost model and renamed to OptForSize, as shouldOptimizeForSize checks both the function attribute and profile.
* We still allow tail folding if we don't need to scalarize any instructions, e.g. see foo_optsize in the test Transforms/LoopVectorize/X86/optsize.ll.

This change requires a lot of test changes. Where a test is specifically testing scalarized predicated instructions I've adjusted it so it still does, either by removing optsize if it makes no difference or forcing tail predication to be enabled. For tests of optsize I've updated the test to check we're not scalarizing.

Fixes #66652
1 parent 2639dea commit 8544164

16 files changed: +202, -802 lines changed
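
In short, the patch adds one rule to the cost model: under optsize (whether by function attribute or by profile), a predicated instruction may not be scalarized unless tail folding or vectorization was explicitly forced. Before the per-file diffs, a minimal self-contained sketch of that decision in plain C++, with illustrative names throughout (the real logic lives in LoopVectorizationCostModel, shown in the diff below; this is not the literal patch):

#include <cstdio>
#include <optional>

// Toy stand-in for llvm::InstructionCost: an empty optional models
// InstructionCost::getInvalid().
using InstructionCost = std::optional<unsigned>;

struct CostModelSketch {
  bool OptForSize;        // cached shouldOptimizeForSize() result
  bool TailFoldingForced; // a tail-folding style was forced on the command line
  bool VectorizeForced;   // loop has llvm.loop.vectorize.enable = true

  // Mirrors the new early-out in getMemInstScalarizationCost: refuse to
  // produce a scalarization cost for a predicated instruction under optsize,
  // so no vectorization plan that needs such scalarization can be selected.
  InstructionCost memInstScalarizationCost(bool Predicated) const {
    if (Predicated && OptForSize && !TailFoldingForced && !VectorizeForced)
      return std::nullopt; // the "invalid" cost
    return 4;              // placeholder finite cost
  }
};

int main() {
  CostModelSketch CM{/*OptForSize=*/true, /*TailFoldingForced=*/false,
                     /*VectorizeForced=*/false};
  InstructionCost C = CM.memInstScalarizationCost(/*Predicated=*/true);
  if (C)
    std::printf("scalarization cost: %u\n", *C);
  else
    std::puts("invalid cost: will not scalarize under optsize");
  return 0;
}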

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 26 additions & 17 deletions
@@ -491,12 +491,7 @@ class InnerLoopVectorizer {
         MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
         PSI(PSI), RTChecks(RTChecks), Plan(Plan),
-        VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
-    // Query this against the original loop and save it here because the profile
-    // of the original loop header may change as the transformation happens.
-    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
-        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
-  }
+        VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {}
 
   virtual ~InnerLoopVectorizer() = default;
 
@@ -669,10 +664,6 @@ class InnerLoopVectorizer {
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
 
-  // Whether this loop should be optimized for size based on profile guided size
-  // optimizatios.
-  bool OptForSizeBasedOnProfile;
-
   /// Structure to hold information about generated runtime checks, responsible
   /// for cleaning the checks, if vectorization turns out unprofitable.
   GeneratedRTChecks &RTChecks;
@@ -986,13 +977,18 @@ class LoopVectorizationCostModel {
                              AssumptionCache *AC,
                              OptimizationRemarkEmitter *ORE, const Function *F,
                              const LoopVectorizeHints *Hints,
-                             InterleavedAccessInfo &IAI)
+                             InterleavedAccessInfo &IAI,
+                             ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
         Hints(Hints), InterleaveInfo(IAI) {
     if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
       initializeVScaleForTuning();
     CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
+    // Query this against the original loop and save it here because the profile
+    // of the original loop header may change as the transformation happens.
+    OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+                                             PGSOQueryType::IRPass);
   }
 
   /// \return An upper bound for the vectorization factors (both fixed and
@@ -1833,6 +1829,10 @@ class LoopVectorizationCostModel {
 
   /// The kind of cost that we are calculating
   TTI::TargetCostKind CostKind;
+
+  /// Whether this loop should be optimized for size based on function attribute
+  /// or profile information.
+  bool OptForSize;
 };
 } // end namespace llvm
 
@@ -2612,9 +2612,8 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
   if (!SCEVCheckBlock)
     return nullptr;
 
-  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
-           (OptForSizeBasedOnProfile &&
-            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
+  assert((!Cost->OptForSize ||
+          Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
          "Cannot SCEV check stride or overflow when optimizing for size");
   assert(!LoopBypassBlocks.empty() &&
          "Should already be a bypass block due to iteration count check");
@@ -2639,7 +2638,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
   if (!MemCheckBlock)
     return nullptr;
 
-  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
+  if (Cost->OptForSize) {
     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
            "Cannot emit memory checks when optimizing for size, unless forced "
            "to vectorize.");
@@ -5518,6 +5517,9 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
     // includes the scalarization overhead of the predicated instruction.
     InstructionCost VectorCost = getInstructionCost(I, VF);
 
+    if (VectorCost == InstructionCost::getInvalid())
+      continue;
+
     // Compute the cost of the scalarized instruction. This cost is the cost of
     // the instruction as if it wasn't if-converted and instead remained in the
     // predicated block. We will scale this cost by block probability after
@@ -5660,6 +5662,13 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   if (VF.isScalable())
     return InstructionCost::getInvalid();
 
+  // Don't scalarize predicated instructions when optimizing for size unless
+  // we're forced to.
+  if (isPredicatedInst(I) && OptForSize &&
+      !ForceTailFoldingStyle.getNumOccurrences() &&
+      Hints->getForce() != LoopVectorizeHints::FK_Enabled)
+    return InstructionCost::getInvalid();
+
   Type *ValTy = getLoadStoreType(I);
   auto *SE = PSE.getSE();
 
@@ -10090,7 +10099,7 @@ static bool processLoopInVPlanNativePath(
       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
 
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints, IAI);
+                                &Hints, IAI, PSI, BFI);
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
@@ -10627,7 +10636,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Use the cost model.
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
-                                F, &Hints, IAI);
+                                F, &Hints, IAI, PSI, BFI);
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
                                ORE);
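
A note on the computePredInstDiscount hunk above: once getMemInstScalarizationCost can return an invalid cost, the discount loop has to tolerate it, so it now skips such instructions rather than folding the invalid value into the discount. In miniature, with std::optional again standing in for InstructionCost (illustrative names, not the patch itself):

#include <optional>
#include <vector>

using InstructionCost = std::optional<unsigned>;

// Sum per-instruction discounts, skipping instructions whose vector cost is
// invalid -- the analogue of the
// `if (VectorCost == InstructionCost::getInvalid()) continue;`
// added to computePredInstDiscount above.
unsigned sumValidDiscounts(const std::vector<InstructionCost> &Costs) {
  unsigned Total = 0;
  for (const InstructionCost &C : Costs)
    if (C) // invalid costs contribute nothing
      Total += *C;
  return Total;
}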

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 11 additions & 37 deletions
@@ -1588,55 +1588,29 @@ exit:
   ret void
 }
 
-define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
+define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
 ; DEFAULT-LABEL: define void @redundant_branch_and_tail_folding(
-; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
+; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
 ; DEFAULT-NEXT: entry:
 ; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DEFAULT: vector.ph:
 ; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
 ; DEFAULT: vector.body:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 20)
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[VEC_IND:%.*]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; DEFAULT-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
 ; DEFAULT-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
-; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
-; DEFAULT-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; DEFAULT: pred.store.if:
-; DEFAULT-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DST]], align 4
-; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE]]
-; DEFAULT: pred.store.continue:
-; DEFAULT-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
-; DEFAULT-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
-; DEFAULT: pred.store.if1:
-; DEFAULT-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; DEFAULT-NEXT: store i32 [[TMP6]], ptr [[DST]], align 4
-; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE2]]
-; DEFAULT: pred.store.continue2:
-; DEFAULT-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
-; DEFAULT-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; DEFAULT: pred.store.if3:
-; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; DEFAULT-NEXT: store i32 [[TMP8]], ptr [[DST]], align 4
-; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE4]]
-; DEFAULT: pred.store.continue4:
-; DEFAULT-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
-; DEFAULT-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; DEFAULT: pred.store.if5:
 ; DEFAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
 ; DEFAULT-NEXT: store i32 [[TMP10]], ptr [[DST]], align 4
-; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE6]]
-; DEFAULT: pred.store.continue6:
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; DEFAULT-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; DEFAULT: middle.block:
-; DEFAULT-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; DEFAULT: scalar.ph:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; DEFAULT-NEXT: br label [[LOOP_HEADER:%.*]]
 ; DEFAULT: loop.header:
 ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -1653,7 +1627,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
 ; DEFAULT-NEXT: ret void
 ;
 ; PRED-LABEL: define void @redundant_branch_and_tail_folding(
-; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
+; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
 ; PRED-NEXT: entry:
 ; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; PRED: vector.ph: