Skip to content

Commit e562ff0

Browse files
wlei-llvmtstellar
authored andcommitted
[CSSPGO][llvm-profgen] Aggregate samples on call frame trie to speed up profile generation
For CS profile generation, the process of call stack unwinding is time-consuming, since for each LBR entry we need linear time to generate the context (hashing, compression, string concatenation). This change speeds this up by grouping all the call frames within one LBR sample into a trie and aggregating the results (sample counters) on it, deferring the context compression and string generation to the end of unwinding. Specifically, it uses `StackLeaf` as the top frame on the stack and manipulates it dynamically (popping or pushing a trie node) during virtual unwinding, so that the raw sample can simply be recorded on the leaf node; the path (root to leaf) represents its calling context. In the end, it traverses the trie and generates the context on the fly. Results: Our internal branch shows about 5X speed-up on some large workloads in SPEC06 benchmark. Differential Revision: https://reviews.llvm.org/D94110
1 parent 6209b07 commit e562ff0

File tree

4 files changed

+232
-108
lines changed

4 files changed

+232
-108
lines changed

llvm/tools/llvm-profgen/PerfReader.cpp

Lines changed: 95 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,12 @@ void VirtualUnwinder::unwindCall(UnwindState &State) {
2828
// 2nd frame is in prolog/epilog. In the future, we will switch to
2929
// pro/epi tracker(Dwarf CFI) for the precise check.
3030
uint64_t Source = State.getCurrentLBRSource();
31-
auto Iter = State.CallStack.begin();
32-
if (State.CallStack.size() == 1 || *(++Iter) != Source) {
33-
State.CallStack.front() = Source;
31+
auto *ParentFrame = State.getParentFrame();
32+
if (ParentFrame == State.getDummyRootPtr() ||
33+
ParentFrame->Address != Source) {
34+
State.switchToFrame(Source);
3435
} else {
35-
State.CallStack.pop_front();
36+
State.popFrame();
3637
}
3738
State.InstPtr.update(Source);
3839
}
@@ -41,116 +42,140 @@ void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) {
4142
InstructionPointer &IP = State.InstPtr;
4243
uint64_t Target = State.getCurrentLBRTarget();
4344
uint64_t End = IP.Address;
44-
if (State.getBinary()->usePseudoProbes()) {
45+
if (Binary->usePseudoProbes()) {
46+
// We don't need the top frame probe since it should be extracted
47+
// from the range.
4548
// The outcome of the virtual unwinding with pseudo probes is a
4649
// map from a context key to the address range being unwound.
4750
// This means basically linear unwinding is not needed for pseudo
4851
// probes. The range will be simply recorded here and will be
4952
// converted to a list of pseudo probes to report in ProfileGenerator.
50-
recordRangeCount(Target, End, State, Repeat);
53+
State.getParentFrame()->recordRangeCount(Target, End, Repeat);
5154
} else {
5255
// Unwind linear execution part
56+
uint64_t LeafAddr = State.CurrentLeafFrame->Address;
5357
while (IP.Address >= Target) {
5458
uint64_t PrevIP = IP.Address;
5559
IP.backward();
5660
// Break into segments for implicit call/return due to inlining
57-
bool SameInlinee =
58-
State.getBinary()->inlineContextEqual(PrevIP, IP.Address);
61+
bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address);
5962
if (!SameInlinee || PrevIP == Target) {
60-
recordRangeCount(PrevIP, End, State, Repeat);
63+
State.switchToFrame(LeafAddr);
64+
State.CurrentLeafFrame->recordRangeCount(PrevIP, End, Repeat);
6165
End = IP.Address;
6266
}
63-
State.CallStack.front() = IP.Address;
67+
LeafAddr = IP.Address;
6468
}
6569
}
6670
}
6771

6872
void VirtualUnwinder::unwindReturn(UnwindState &State) {
6973
// Add extra frame as we unwind through the return
7074
const LBREntry &LBR = State.getCurrentLBR();
71-
uint64_t CallAddr = State.getBinary()->getCallAddrFromFrameAddr(LBR.Target);
72-
State.CallStack.front() = CallAddr;
73-
State.CallStack.push_front(LBR.Source);
75+
uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target);
76+
State.switchToFrame(CallAddr);
77+
State.pushFrame(LBR.Source);
7478
State.InstPtr.update(LBR.Source);
7579
}
7680

7781
void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) {
7882
// TODO: Tolerate tail call for now, as we may see tail call from libraries.
7983
// This is only for intra function branches, excluding tail calls.
8084
uint64_t Source = State.getCurrentLBRSource();
81-
State.CallStack.front() = Source;
85+
State.switchToFrame(Source);
8286
State.InstPtr.update(Source);
8387
}
8488

85-
SampleCounter &
86-
VirtualUnwinder::getOrCreateCounter(const ProfiledBinary *Binary,
87-
std::list<uint64_t> &CallStack) {
88-
if (Binary->usePseudoProbes()) {
89-
return getOrCreateCounterForProbe(Binary, CallStack);
90-
}
89+
std::shared_ptr<StringBasedCtxKey> FrameStack::getContextKey() {
9190
std::shared_ptr<StringBasedCtxKey> KeyStr =
9291
std::make_shared<StringBasedCtxKey>();
93-
KeyStr->Context = Binary->getExpandedContextStr(CallStack);
92+
KeyStr->Context = Binary->getExpandedContextStr(Stack);
9493
KeyStr->genHashCode();
95-
auto Ret =
96-
CtxCounterMap->emplace(Hashable<ContextKey>(KeyStr), SampleCounter());
97-
return Ret.first->second;
94+
return KeyStr;
9895
}
9996

100-
SampleCounter &
101-
VirtualUnwinder::getOrCreateCounterForProbe(const ProfiledBinary *Binary,
102-
std::list<uint64_t> &CallStack) {
97+
std::shared_ptr<ProbeBasedCtxKey> ProbeStack::getContextKey() {
10398
std::shared_ptr<ProbeBasedCtxKey> ProbeBasedKey =
10499
std::make_shared<ProbeBasedCtxKey>();
105-
if (CallStack.size() > 1) {
106-
// We don't need to top frame probe since it should be extracted
107-
// from the range.
108-
// The top of stack is an instruction from the function where
109-
// the LBR address range physcially resides. Strip it since
110-
// the function is not a part of the call context. We also
111-
// don't need its inline context since the probes being unwound
112-
// come with an inline context all the way back to the uninlined
113-
// function in their prefix tree.
114-
auto Iter = CallStack.rbegin();
115-
auto EndT = std::prev(CallStack.rend());
116-
for (; Iter != EndT; Iter++) {
117-
uint64_t Address = *Iter;
118-
const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Address);
119-
// We may not find a probe for a merged or external callsite.
120-
// Callsite merging may cause the loss of original probe IDs.
121-
// Cutting off the context from here since the inline will
122-
// not know how to consume a context with unknown callsites.
123-
if (!CallProbe)
124-
break;
125-
ProbeBasedKey->Probes.emplace_back(CallProbe);
126-
}
100+
for (auto CallProbe : Stack) {
101+
ProbeBasedKey->Probes.emplace_back(CallProbe);
127102
}
128103
CSProfileGenerator::compressRecursionContext<const PseudoProbe *>(
129104
ProbeBasedKey->Probes);
130105
ProbeBasedKey->genHashCode();
131-
Hashable<ContextKey> ContextId(ProbeBasedKey);
132-
auto Ret = CtxCounterMap->emplace(ContextId, SampleCounter());
133-
return Ret.first->second;
106+
return ProbeBasedKey;
107+
}
108+
109+
template <typename T>
110+
void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur,
111+
T &Stack) {
112+
if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty())
113+
return;
114+
115+
std::shared_ptr<ContextKey> Key = Stack.getContextKey();
116+
auto Ret = CtxCounterMap->emplace(Hashable<ContextKey>(Key), SampleCounter());
117+
SampleCounter &SCounter = Ret.first->second;
118+
for (auto &Item : Cur->RangeSamples) {
119+
uint64_t StartOffset = Binary->virtualAddrToOffset(std::get<0>(Item));
120+
uint64_t EndOffset = Binary->virtualAddrToOffset(std::get<1>(Item));
121+
SCounter.recordRangeCount(StartOffset, EndOffset, std::get<2>(Item));
122+
}
123+
124+
for (auto &Item : Cur->BranchSamples) {
125+
uint64_t SourceOffset = Binary->virtualAddrToOffset(std::get<0>(Item));
126+
uint64_t TargetOffset = Binary->virtualAddrToOffset(std::get<1>(Item));
127+
SCounter.recordBranchCount(SourceOffset, TargetOffset, std::get<2>(Item));
128+
}
129+
}
130+
131+
template <typename T>
132+
void VirtualUnwinder::collectSamplesFromFrameTrie(
133+
UnwindState::ProfiledFrame *Cur, T &Stack) {
134+
if (!Cur->isDummyRoot()) {
135+
if (!Stack.pushFrame(Cur)) {
136+
// Process truncated context
137+
for (const auto &Item : Cur->Children) {
138+
// Start a new traversal ignoring its bottom context
139+
collectSamplesFromFrameTrie(Item.second.get());
140+
}
141+
return;
142+
}
143+
}
144+
145+
collectSamplesFromFrame(Cur, Stack);
146+
// Process children frame
147+
for (const auto &Item : Cur->Children) {
148+
collectSamplesFromFrameTrie(Item.second.get(), Stack);
149+
}
150+
// Recover the call stack
151+
Stack.popFrame();
134152
}
135153

136-
void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End,
137-
UnwindState &State, uint64_t Repeat) {
138-
uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start);
139-
uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End);
140-
SampleCounter &SCounter =
141-
getOrCreateCounter(State.getBinary(), State.CallStack);
142-
SCounter.recordRangeCount(StartOffset, EndOffset, Repeat);
154+
void VirtualUnwinder::collectSamplesFromFrameTrie(
155+
UnwindState::ProfiledFrame *Cur) {
156+
if (Binary->usePseudoProbes()) {
157+
ProbeStack Stack(Binary);
158+
collectSamplesFromFrameTrie<ProbeStack>(Cur, Stack);
159+
} else {
160+
FrameStack Stack(Binary);
161+
collectSamplesFromFrameTrie<FrameStack>(Cur, Stack);
162+
}
143163
}
144164

145165
void VirtualUnwinder::recordBranchCount(const LBREntry &Branch,
146166
UnwindState &State, uint64_t Repeat) {
147167
if (Branch.IsArtificial)
148168
return;
149-
uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source);
150-
uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target);
151-
SampleCounter &SCounter =
152-
getOrCreateCounter(State.getBinary(), State.CallStack);
153-
SCounter.recordBranchCount(SourceOffset, TargetOffset, Repeat);
169+
170+
if (Binary->usePseudoProbes()) {
171+
// Same as recordRangeCount, we don't need the top frame probe since we will
172+
// extract it from branch's source address
173+
State.getParentFrame()->recordBranchCount(Branch.Source, Branch.Target,
174+
Repeat);
175+
} else {
176+
State.CurrentLeafFrame->recordBranchCount(Branch.Source, Branch.Target,
177+
Repeat);
178+
}
154179
}
155180

156181
bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
@@ -199,6 +224,8 @@ bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
199224
// Record `branch` with calling context after unwinding.
200225
recordBranchCount(Branch, State, Repeat);
201226
}
227+
// As samples are aggregated on trie, record them into counter map
228+
collectSamplesFromFrameTrie(State.getDummyRootPtr());
202229

203230
return true;
204231
}
@@ -325,7 +352,8 @@ void PerfReader::printUnwinderOutput() {
325352
void PerfReader::unwindSamples() {
326353
for (const auto &Item : AggregatedSamples) {
327354
const HybridSample *Sample = dyn_cast<HybridSample>(Item.first.getPtr());
328-
VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary]);
355+
VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary],
356+
Sample->Binary);
329357
Unwinder.unwind(Sample, Item.second);
330358
}
331359

@@ -334,7 +362,7 @@ void PerfReader::unwindSamples() {
334362
}
335363

336364
bool PerfReader::extractLBRStack(TraceStream &TraceIt,
337-
SmallVector<LBREntry, 16> &LBRStack,
365+
SmallVectorImpl<LBREntry> &LBRStack,
338366
ProfiledBinary *Binary) {
339367
// The raw format of LBR stack is like:
340368
// 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
@@ -398,7 +426,7 @@ bool PerfReader::extractLBRStack(TraceStream &TraceIt,
398426
}
399427

400428
bool PerfReader::extractCallstack(TraceStream &TraceIt,
401-
std::list<uint64_t> &CallStack) {
429+
SmallVectorImpl<uint64_t> &CallStack) {
402430
// The raw format of call stack is like:
403431
// 4005dc # leaf frame
404432
// 400634

0 commit comments

Comments
 (0)