-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[LoopPeel] Fix branch weights' effect on block frequencies #128785
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
f413520
f821eeb
af8ec56
a0264ad
fd29a49
6303177
bbd0e95
715cb0a
67fa67d
37ce859
4337dcd
5193158
bbd2f22
b23f467
13d1fbb
e250cfc
859b84d
db5920a
47fbe85
f8097fb
7b27203
4c4669a
0f40efd
2791a1c
6148922
3f6a91a
e5a0a26
3a49b43
c283ebe
2f7daa8
ecbf6e0
c627fc5
f1fa8d9
38ace1e
92ddaa0
a3e0d72
67f22cd
f0ff2e2
69fe051
e7eb1fe
e4f68c3
0973ab3
680bdc2
83531b3
59cd184
47051ce
5d00250
5719779
98cab7b
3cbe07d
b3831b6
59ab013
b8aed9b
5c9e43e
83ac767
1f81310
04c8ade
a1a5460
f5b1885
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -651,84 +651,6 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, | |
} | ||
} | ||
|
||
/// Branch-weight bookkeeping for one exiting terminator while iterations are | ||
/// being peeled: the weights to attach to the next peeled copy, and the | ||
/// per-successor amount by which those weights shrink after each copy. | ||
struct WeightInfo { | ||
// Weights to apply to the next peeled copy of the terminator. They are | ||
// lowered (but never below the matching SubWeights entry) after every | ||
// peeled iteration. | ||
SmallVector<uint32_t> Weights; | ||
// Amount to subtract from each successor's weight after each peeled | ||
// iteration. An entry of zero means that successor's weight is left alone. | ||
const SmallVector<uint32_t> SubWeights; | ||
}; | ||
|
||
/// Set the branch weights on the exiting terminator \p Term of a peeled-off | ||
/// loop iteration, then advance \p Info to the weights for the next one. | ||
/// | ||
/// Let F be the weight of the edge that continues (falls through) into the | ||
/// loop, and let E be the weight of the edge to an exit. | ||
/// F/(F+E) is the probability of staying in the loop and E/(F+E) is the | ||
/// probability of exiting. | ||
/// Then the estimated exit count is EC = F / E. | ||
/// For the I-th (counting from 0) peeled-off iteration we set the weights of | ||
/// the peeled exit to (EC - I, 1). This gives a reasonable distribution: the | ||
/// probability of exiting grows with each peeled iteration, while the | ||
/// estimated exit count of the remainder loop is reduced by I. | ||
/// To avoid dealing with division rounding, we multiply both parts of the | ||
/// weights by E and use (F - I * E, E) instead. | ||
/// | ||
/// \param Term Terminator that receives the current weights. | ||
/// \param Info Current weights (updated in place) and per-iteration deltas. | ||
static void updateBranchWeights(Instruction *Term, WeightInfo &Info) { | ||
setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false); | ||
for (auto [Idx, SubWeight] : enumerate(Info.SubWeights)) | ||
if (SubWeight != 0) | ||
// Don't set the probability of taking the edge from the latch to the loop | ||
// header to less than a 1:1 ratio (that is, Weight must not drop below | ||
// SubWeight), as this could significantly reduce the loop's hotness, | ||
// which would be incorrect if the trip count was underestimated. | ||
// The comparison before the subtraction also keeps the unsigned | ||
// difference from wrapping around. | ||
Info.Weights[Idx] = | ||
Info.Weights[Idx] > SubWeight | ||
? std::max(Info.Weights[Idx] - SubWeight, SubWeight) | ||
: SubWeight; | ||
} | ||
|
||
/// Initialize the weights for all exiting blocks of \p L. | ||
/// | ||
/// For every exiting terminator from which branch weights can be extracted | ||
/// and whose in-loop weight is non-zero, record in \p WeightInfos the | ||
/// original weights together with the amount to subtract from each successor | ||
/// after every peeled iteration. | ||
static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos, | ||
Loop *L) { | ||
SmallVector<BasicBlock *> ExitingBlocks; | ||
L->getExitingBlocks(ExitingBlocks); | ||
for (BasicBlock *ExitingBlock : ExitingBlocks) { | ||
Instruction *Term = ExitingBlock->getTerminator(); | ||
SmallVector<uint32_t> Weights; | ||
// Skip terminators for which no weights could be extracted (presumably | ||
// those without branch-weight profile metadata; verify against | ||
// extractBranchWeights). | ||
if (!extractBranchWeights(*Term, Weights)) | ||
continue; | ||
|
||
// Split the total weight into the part that stays in the loop | ||
// (fallthrough) and the part that leaves it (exit). | ||
// NOTE(review): these 32-bit sums are unchecked and could wrap for very | ||
// large weights; confirm the expected weight range. | ||
uint32_t FallThroughWeights = 0; | ||
uint32_t ExitWeights = 0; | ||
for (auto [Succ, Weight] : zip(successors(Term), Weights)) { | ||
if (L->contains(Succ)) | ||
FallThroughWeights += Weight; | ||
else | ||
ExitWeights += Weight; | ||
} | ||
|
||
// Don't try to update weights in the degenerate case where no weight | ||
// stays in the loop; this also avoids dividing by zero below. | ||
if (FallThroughWeights == 0) | ||
continue; | ||
|
||
SmallVector<uint32_t> SubWeights; | ||
for (auto [Succ, Weight] : zip(successors(Term), Weights)) { | ||
if (!L->contains(Succ)) { | ||
// Exit weights stay the same, so nothing is subtracted from them. | ||
SubWeights.push_back(0); | ||
continue; | ||
} | ||
|
||
// Subtract the exit weights on each iteration, distributed across all | ||
// fallthrough edges in proportion to each edge's share of the | ||
// fallthrough weight. The conversion back to an integer truncates. | ||
double W = (double)Weight / (double)FallThroughWeights; | ||
SubWeights.push_back((uint32_t)(ExitWeights * W)); | ||
} | ||
|
||
WeightInfos.insert({Term, {std::move(Weights), std::move(SubWeights)}}); | ||
} | ||
} | ||
|
||
/// Clones the body of the loop L, putting it between \p InsertTop and \p | ||
/// InsertBot. | ||
/// \param IterNumber The serial number of the iteration currently being | ||
|
@@ -1002,11 +924,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, | |
Instruction *LatchTerm = | ||
cast<Instruction>(cast<BasicBlock>(Latch)->getTerminator()); | ||
|
||
// If we have branch weight information, we'll want to update it for the | ||
// newly created branches. | ||
DenseMap<Instruction *, WeightInfo> Weights; | ||
initBranchWeights(Weights, L); | ||
|
||
// Identify what noalias metadata is inside the loop: if it is inside the | ||
// loop, the associated metadata must be cloned for each iteration. | ||
SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes; | ||
|
@@ -1034,11 +951,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, | |
assert(DT.verify(DominatorTree::VerificationLevel::Fast)); | ||
#endif | ||
|
||
for (auto &[Term, Info] : Weights) { | ||
auto *TermCopy = cast<Instruction>(VMap[Term]); | ||
updateBranchWeights(TermCopy, Info); | ||
} | ||
|
||
// Remove Loop metadata from the latch branch instruction | ||
// because it is not the Loop's latch branch anymore. | ||
auto *LatchTermCopy = cast<Instruction>(VMap[LatchTerm]); | ||
|
@@ -1064,15 +976,62 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, | |
PHI->setIncomingValueForBlock(NewPreHeader, NewVal); | ||
} | ||
|
||
for (const auto &[Term, Info] : Weights) { | ||
setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false); | ||
} | ||
|
||
// Update Metadata for count of peeled off iterations. | ||
unsigned AlreadyPeeled = 0; | ||
if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData)) | ||
AlreadyPeeled = *Peeled; | ||
addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount); | ||
unsigned TotalPeeled = AlreadyPeeled + PeelCount; | ||
addStringMetadataToLoop(L, PeeledCountMetaData, TotalPeeled); | ||
|
||
// Update metadata for the estimated trip count. The original branch weight | ||
// metadata is already correct for both the remaining loop and the peeled loop | ||
// iterations, so don't adjust it. | ||
// | ||
// For example, consider what happens when peeling 2 iterations from a loop | ||
// with an estimated trip count of 10 and inserting them before the remaining | ||
// loop. Each of the peeled iterations and each iteration in the remaining | ||
// loop still has the same probability of exiting the *entire original* loop | ||
// as it did when in the original loop, and thus it should still have the same | ||
// branch weights. The peeled iterations' non-zero probabilities of exiting | ||
// already appropriately reduce the probability of reaching the remaining | ||
// iterations just as they did in the original loop. Trying to also adjust | ||
// the remaining loop's branch weights to reflect its new trip count of 8 will | ||
// erroneously further reduce its block frequencies. However, in case an | ||
// analysis later needs to determine the trip count of the remaining loop | ||
// while examining it in isolation without considering the probability of | ||
// actually reaching it, we store the new trip count as separate metadata. | ||
// | ||
// FIXME: getLoopEstimatedTripCount and setLoopEstimatedTripCount skip loops | ||
// that don't match the restrictions of getExpectedExitLoopLatchBranch in | ||
// LoopUtils.cpp. For example, | ||
// llvm/test/Transforms/LoopUnroll/peel-branch-weights.ll (introduced by | ||
// b43a4d0850d5) has multiple exits. Should we try to extend them to handle | ||
// such cases? For now, we just don't try to record | ||
// llvm.loop.estimated_trip_count for such cases, so the original branch | ||
// weights will have to do. | ||
if (auto EstimatedTripCount = getLoopEstimatedTripCount(L)) { | ||
// FIXME: The previous updateBranchWeights implementation had this | ||
// comment: | ||
// | ||
// Don't set the probability of taking the edge from latch to loop header | ||
// to less than 1:1 ratio (meaning Weight should not be lower than | ||
// SubWeight), as this could significantly reduce the loop's hotness, | ||
// which would be incorrect in the case of underestimating the trip count. | ||
// | ||
// See e8d5db206c2f commit log for further discussion. That seems to | ||
// suggest that we should avoid ever setting a trip count of < 2 here | ||
// (equal chance of continuing and exiting means the loop will likely | ||
// continue once and then exit once). Or is keeping the original branch | ||
// weights already a sufficient improvement for whatever analysis cares | ||
// about this case? | ||
|
||
unsigned EstimatedTripCountNew = *EstimatedTripCount; | ||
if (EstimatedTripCountNew < TotalPeeled) // FIXME: TotalPeeled + 2? | ||
EstimatedTripCountNew = 0; // FIXME: = 2? | ||
else | ||
EstimatedTripCountNew -= TotalPeeled; | ||
setLoopEstimatedTripCount(L, EstimatedTripCountNew, | ||
/*EstimatedLoopInvocationWeight=*/std::nullopt); | ||
} | ||
|
||
if (Loop *ParentLoop = L->getParentLoop()) | ||
L = ParentLoop; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
; Test branch weight metadata, estimated trip count metadata, and block | ||
; frequencies after loop peeling. | ||
|
||
; RUN: opt < %s -S -passes='print<block-freq>' 2>&1 | \ | ||
; RUN: FileCheck -check-prefix=CHECK %s | ||
|
||
; The -implicit-check-not options make sure that no additional labels or calls | ||
; to @f show up. | ||
; RUN: opt < %s -S -passes='loop-unroll,print<block-freq>' \ | ||
; RUN: -unroll-force-peel-count=2 2>&1 | \ | ||
; RUN: FileCheck %s -check-prefix=CHECK-UR \ | ||
; RUN: -implicit-check-not='{{^[^ ;]*:}}' \ | ||
; RUN: -implicit-check-not='call void @f' | ||
|
||
; CHECK: block-frequency-info: test | ||
; CHECK: do.body: float = 10.0, | ||
|
||
; The sum should still be ~10. | ||
; | ||
; CHECK-UR: block-frequency-info: test | ||
; CHECK-UR: - [[DO_BODY_PEEL:.*]]: float = 1.0, | ||
; CHECK-UR: - [[DO_BODY_PEEL2:.*]]: float = 0.9, | ||
; CHECK-UR: - [[DO_BODY:.*]]: float = 8.1, | ||
|
||
declare void @f(i32) | ||
|
||
define void @test(i32 %n) { | ||
; CHECK-UR-LABEL: define void @test( | ||
; CHECK-UR: [[ENTRY:.*]]: | ||
; CHECK-UR: br label %[[DO_BODY_PEEL_BEGIN:.*]] | ||
; CHECK-UR: [[DO_BODY_PEEL_BEGIN]]: | ||
; CHECK-UR: br label %[[DO_BODY_PEEL:.*]] | ||
; CHECK-UR: [[DO_BODY_PEEL]]: | ||
; CHECK-UR: call void @f | ||
; CHECK-UR: br i1 %{{.*}}, label %[[DO_END:.*]], label %[[DO_BODY_PEEL_NEXT:.*]], !prof ![[#PROF:]] | ||
; CHECK-UR: [[DO_BODY_PEEL_NEXT]]: | ||
; CHECK-UR: br label %[[DO_BODY_PEEL2:.*]] | ||
; CHECK-UR: [[DO_BODY_PEEL2]]: | ||
; CHECK-UR: call void @f | ||
; CHECK-UR: br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_PEEL_NEXT1:.*]], !prof ![[#PROF]] | ||
; CHECK-UR: [[DO_BODY_PEEL_NEXT1]]: | ||
; CHECK-UR: br label %[[DO_BODY_PEEL_NEXT5:.*]] | ||
; CHECK-UR: [[DO_BODY_PEEL_NEXT5]]: | ||
; CHECK-UR: br label %[[ENTRY_PEEL_NEWPH:.*]] | ||
; CHECK-UR: [[ENTRY_PEEL_NEWPH]]: | ||
; CHECK-UR: br label %[[DO_BODY]] | ||
; CHECK-UR: [[DO_BODY]]: | ||
; CHECK-UR: call void @f | ||
; CHECK-UR: br i1 %{{.*}}, label %[[DO_END_LOOPEXIT:.*]], label %[[DO_BODY]], !prof ![[#PROF]], !llvm.loop ![[#LOOP_UR_LATCH:]] | ||
; CHECK-UR: [[DO_END_LOOPEXIT]]: | ||
; CHECK-UR: br label %[[DO_END]] | ||
; CHECK-UR: [[DO_END]]: | ||
; CHECK-UR: ret void | ||
|
||
entry: | ||
br label %do.body | ||
|
||
do.body: | ||
%i = phi i32 [ 0, %entry ], [ %inc, %do.body ] | ||
%inc = add i32 %i, 1 | ||
call void @f(i32 %i) | ||
%c = icmp sge i32 %inc, %n | ||
br i1 %c, label %do.end, label %do.body, !prof !0 | ||
|
||
do.end: | ||
ret void | ||
} | ||
|
||
!0 = !{!"branch_weights", i32 1, i32 9} | ||
|
||
; CHECK-UR: ![[#PROF]] = !{!"branch_weights", i32 1, i32 9} | ||
; CHECK-UR: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_PC:]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]} | ||
; CHECK-UR: ![[#LOOP_UR_PC]] = !{!"llvm.loop.peeled.count", i32 2} | ||
; CHECK-UR: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 8} | ||
; CHECK-UR: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"} |
Uh oh!
There was an error while loading. Please reload this page.