Skip to content

Commit 4e33fb7

Browse files
committed
Add initial support for SPE brstack
Perf will be able to report SPE branch events in a similar way to how it reports LBR brstack entries. Therefore, we can utilize the existing LBR parsing process for SPE as well.

Example of the SPE brstack input format:

```bash
perf script -i perf.data -F pid,brstack --itrace=bl
```

```
---
PID FROM TO PREDICTED
---
16984 0x72e342e5f4/0x72e36192d0/M/-/-/11/RET/-
16984 0x72e7b8b3b4/0x72e7b8b3b8/PN/-/-/11/COND/-
16984 0x72e7b92b48/0x72e7b92b4c/PN/-/-/8/COND/-
16984 0x72eacc6b7c/0x760cc94b00/P/-/-/9/RET/-
16984 0x72e3f210fc/0x72e3f21068/P/-/-/4//-
16984 0x72e39b8c5c/0x72e3627b24/P/-/-/4//-
16984 0x72e7b89d20/0x72e7b92bbc/P/-/-/4/RET/-
```

The SPE brstack mispredicted flag might be two characters long: 'PN' or 'MN', where 'N' means the branch was marked as NOT-TAKEN. This event is only related to conditional instructions (conditional branch or compare-and-branch); it indicates that the instruction failed its condition code check.

Perf with 'brstack' support for SPE is available here:

```
https://github.com/Leo-Yan/linux/tree/perf_arm_spe_branch_flags_v2
```

Example of usage with SPE perf data:

```bash
perf2bolt -p perf.data -o perf.fdata --spe BINARY
```

Capture standard SPE branch events with perf:

```bash
perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY
```

A unit test is also added to check the parsing process of the SPE brstack format.
1 parent 607a5df commit 4e33fb7

File tree

3 files changed

+122
-32
lines changed

3 files changed

+122
-32
lines changed

bolt/lib/Profile/DataAggregator.cpp

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,10 @@ static cl::opt<bool>
4949
cl::desc("aggregate basic samples (without LBR info)"),
5050
cl::cat(AggregatorCategory));
5151

52-
cl::opt<bool> ArmSPE(
53-
"spe",
54-
cl::desc(
55-
"Enable Arm SPE mode. Used in conjuction with no-lbr mode, ie `--spe "
56-
"--nl`"),
57-
cl::cat(AggregatorCategory));
52+
cl::opt<bool> ArmSPE("spe",
53+
cl::desc("Enable Arm SPE mode. Can combine with `--nl` "
54+
"to use in no-lbr mode"),
55+
cl::cat(AggregatorCategory));
5856

5957
static cl::opt<std::string>
6058
ITraceAggregation("itrace",
@@ -190,13 +188,16 @@ void DataAggregator::start() {
190188

191189
if (opts::ArmSPE) {
192190
if (!opts::BasicAggregation) {
193-
errs() << "PERF2BOLT-ERROR: Arm SPE mode is combined only with "
194-
"BasicAggregation.\n";
195-
exit(1);
191+
// pid from_ip to_ip predicted/missed not-taken?
192+
// 12345 0x123/0x456/PN/-/-/8/RET/-
193+
launchPerfProcess("SPE brstack events", MainEventsPPI,
194+
"script -F pid,brstack --itrace=bl",
195+
/*Wait = */ false);
196+
} else {
197+
launchPerfProcess("SPE branch events (non-lbr)", MainEventsPPI,
198+
"script -F pid,event,ip,addr --itrace=i1i",
199+
/*Wait = */ false);
196200
}
197-
launchPerfProcess("branch events with SPE", MainEventsPPI,
198-
"script -F pid,event,ip,addr --itrace=i1i",
199-
/*Wait = */ false);
200201
} else if (opts::BasicAggregation) {
201202
launchPerfProcess("events without LBR", MainEventsPPI,
202203
"script -F pid,event,ip",
@@ -530,7 +531,7 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
530531
filterBinaryMMapInfo();
531532
prepareToParse("events", MainEventsPPI, ErrorCallback);
532533

533-
if (((!opts::BasicAggregation && !opts::ArmSPE) && parseBranchEvents()) ||
534+
if ((!opts::BasicAggregation && parseBranchEvents()) ||
534535
(opts::BasicAggregation && opts::ArmSPE && parseSpeAsBasicEvents()) ||
535536
(opts::BasicAggregation && parseBasicEvents()))
536537
errs() << "PERF2BOLT: failed to parse samples\n";
@@ -1016,9 +1017,20 @@ ErrorOr<DataAggregator::LBREntry> DataAggregator::parseLBREntry() {
10161017
if (std::error_code EC = MispredStrRes.getError())
10171018
return EC;
10181019
StringRef MispredStr = MispredStrRes.get();
1019-
if (MispredStr.size() != 1 ||
1020-
(MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-')) {
1021-
reportError("expected single char for mispred bit");
1020+
// SPE brstack mispredicted flags might be two characters long: 'PN' or 'MN'.
1021+
bool ValidStrSize = opts::ArmSPE ?
1022+
MispredStr.size() >= 1 && MispredStr.size() <= 2 : MispredStr.size() == 1;
1023+
bool SpeTakenBitErr =
1024+
(opts::ArmSPE && MispredStr.size() == 2 && MispredStr[1] != 'N');
1025+
bool PredictionBitErr =
1026+
!ValidStrSize ||
1027+
(MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-');
1028+
if (SpeTakenBitErr)
1029+
reportError("expected 'N' as SPE prediction bit for a not-taken branch");
1030+
if (PredictionBitErr)
1031+
reportError("expected 'P', 'M' or '-' char as a prediction bit");
1032+
1033+
if (SpeTakenBitErr || PredictionBitErr) {
10221034
Diag << "Found: " << MispredStr << "\n";
10231035
return make_error_code(llvm::errc::io_error);
10241036
}
@@ -1581,9 +1593,11 @@ void DataAggregator::printBranchStacksDiagnostics(
15811593
}
15821594

15831595
std::error_code DataAggregator::parseBranchEvents() {
1584-
outs() << "PERF2BOLT: parse branch events...\n";
1585-
NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName,
1586-
TimerGroupDesc, opts::TimeAggregator);
1596+
std::string BranchEventTypeStr =
1597+
opts::ArmSPE ? "branch events" : "SPE branch events in LBR-format";
1598+
outs() << "PERF2BOLT: " << BranchEventTypeStr << "...\n";
1599+
NamedRegionTimer T("parseBranch", "Parsing " + BranchEventTypeStr,
1600+
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
15871601

15881602
uint64_t NumEntries = 0;
15891603
uint64_t NumSamples = 0;
@@ -1609,7 +1623,8 @@ std::error_code DataAggregator::parseBranchEvents() {
16091623
}
16101624

16111625
NumEntries += Sample.LBR.size();
1612-
if (BAT && Sample.LBR.size() == 32 && !NeedsSkylakeFix) {
1626+
if (this->BC->isX86() && BAT && Sample.LBR.size() == 32 &&
1627+
!NeedsSkylakeFix) {
16131628
errs() << "PERF2BOLT-WARNING: using Intel Skylake bug workaround\n";
16141629
NeedsSkylakeFix = true;
16151630
}
@@ -1632,10 +1647,17 @@ std::error_code DataAggregator::parseBranchEvents() {
16321647
if (NumSamples && NumSamplesNoLBR == NumSamples) {
16331648
// Note: we don't know if perf2bolt is being used to parse memory samples
16341649
// at this point. In this case, it is OK to parse zero LBRs.
1635-
errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
1636-
"LBR. Record profile with perf record -j any or run perf2bolt "
1637-
"in no-LBR mode with -nl (the performance improvement in -nl "
1638-
"mode may be limited)\n";
1650+
if (!opts::ArmSPE)
1651+
errs()
1652+
<< "PERF2BOLT-WARNING: all recorded samples for this binary lack "
1653+
"LBR. Record profile with perf record -j any or run perf2bolt "
1654+
"in no-LBR mode with -nl (the performance improvement in -nl "
1655+
"mode may be limited)\n";
1656+
else
1657+
errs()
1658+
<< "PERF2BOLT-WARNING: all recorded samples for this binary lack "
1659+
"SPE brstack entries. Record profile with:"
1660+
"perf record arm_spe_0/branch_filter=1/";
16391661
} else {
16401662
printBranchStacksDiagnostics(NumTotalSamples - NumSamples);
16411663
}
Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
1-
## Check that Arm SPE mode is available on AArch64 with BasicAggregation.
1+
## Check that Arm SPE mode is available on AArch64.
22

33
REQUIRES: system-linux,perf,target=aarch64{{.*}}
44

5-
RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
6-
RUN: touch %t.empty.perf.data
7-
RUN: perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --nl --spe --pa %t.exe 2>&1 | FileCheck %s --check-prefix=CHECK-SPE-NO-LBR
5+
RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe 2> /dev/null
86

9-
CHECK-SPE-NO-LBR: PERF2BOLT: Starting data aggregation job
7+
RUN: (perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe --nl %t.exe 2> /dev/null; exit 0) | FileCheck %s --check-prefix=CHECK-SPE-NO-LBR
108

11-
RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe
12-
RUN: not perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2>&1 | FileCheck %s --check-prefix=CHECK-SPE-LBR
9+
RUN: (perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2> /dev/null; exit 0) | FileCheck %s --check-prefix=CHECK-SPE-LBR
10+
11+
CHECK-SPE-NO-LBR: PERF2BOLT: spawning perf job to read SPE branch events (non-lbr)
12+
CHECK-SPE-LBR: PERF2BOLT: spawning perf job to read SPE brstack events
1313

14-
CHECK-SPE-LBR: PERF2BOLT-ERROR: Arm SPE mode is combined only with BasicAggregation.

bolt/unittests/Profile/PerfSpeEvents.cpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ using namespace llvm::ELF;
2323

2424
namespace opts {
2525
extern cl::opt<std::string> ReadPerfEvents;
26+
extern cl::opt<bool> ArmSPE;
2627
} // namespace opts
2728

2829
namespace llvm {
@@ -38,6 +39,8 @@ struct PerfSpeEventsTestHelper : public testing::Test {
3839
}
3940

4041
protected:
42+
using LBREntry = DataAggregator::LBREntry;
43+
4144
void initalizeLLVM() {
4245
llvm::InitializeAllTargetInfos();
4346
llvm::InitializeAllTargetMCs();
@@ -88,6 +91,45 @@ struct PerfSpeEventsTestHelper : public testing::Test {
8891

8992
return SampleSize == DA.BasicSamples.size();
9093
}
94+
95+
/// Compare LBREntries
96+
bool checkLBREntry(const LBREntry &Lhs, const LBREntry &Rhs) {
97+
return Lhs.From == Rhs.From && Lhs.To == Rhs.To &&
98+
Lhs.Mispred == Rhs.Mispred;
99+
}
100+
101+
/// Parse and check SPE brstack as LBR
102+
void parseAndCheckBrstackEvents(
103+
uint64_t PID,
104+
const std::vector<SmallVector<LBREntry, 2>> &ExpectedSamples) {
105+
int NumSamples = 0;
106+
107+
DataAggregator DA("<pseudo input>");
108+
DA.ParsingBuf = opts::ReadPerfEvents;
109+
DA.BC = BC.get();
110+
DataAggregator::MMapInfo MMap;
111+
DA.BinaryMMapInfo.insert(std::make_pair(PID, MMap));
112+
113+
// Process buffer.
114+
while (DA.hasData()) {
115+
ErrorOr<DataAggregator::PerfBranchSample> SampleRes =
116+
DA.parseBranchSample();
117+
if (std::error_code EC = SampleRes.getError())
118+
EXPECT_NE(EC, std::errc::no_such_process);
119+
120+
DataAggregator::PerfBranchSample &Sample = SampleRes.get();
121+
EXPECT_EQ(Sample.LBR.size(), ExpectedSamples[NumSamples].size());
122+
123+
// Check the parsed LBREntries.
124+
const auto *ActualIter = Sample.LBR.begin();
125+
const auto *ExpectIter = ExpectedSamples[NumSamples].begin();
126+
while (ActualIter != Sample.LBR.end() &&
127+
ExpectIter != ExpectedSamples[NumSamples].end())
128+
EXPECT_TRUE(checkLBREntry(*ActualIter++, *ExpectIter++));
129+
130+
++NumSamples;
131+
}
132+
}
91133
};
92134

93135
} // namespace bolt
@@ -113,6 +155,33 @@ TEST_F(PerfSpeEventsTestHelper, SpeBranches) {
113155
EXPECT_TRUE(checkEvents(1234, 10, {"branches-spe:"}));
114156
}
115157

158+
TEST_F(PerfSpeEventsTestHelper, SpeBranchesWithBrstack) {
159+
// Check perf input with SPE branch events as brstack format.
160+
// Example collection command:
161+
// ```
162+
// perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY
163+
// ```
164+
// How Bolt extracts the branch events:
165+
// ```
166+
// perf script -F pid,brstack --itrace=bl
167+
// ```
168+
169+
opts::ArmSPE = true;
170+
opts::ReadPerfEvents = " 1234 0xa001/0xa002/PN/-/-/10/COND/-\n"
171+
" 1234 0xb001/0xb002/P/-/-/4/RET/-\n"
172+
" 1234 0xc001/0xc002/P/-/-/13/-/-\n"
173+
" 1234 0xd001/0xd002/M/-/-/7/RET/-\n"
174+
" 1234 0xe001/0xe002/P/-/-/14/RET/-\n"
175+
" 1234 0xf001/0xf002/MN/-/-/8/COND/-\n";
176+
177+
std::vector<SmallVector<LBREntry, 2>> ExpectedSamples = {
178+
{{{0xa001, 0xa002, false}}}, {{{0xb001, 0xb002, false}}},
179+
{{{0xc001, 0xc002, false}}}, {{{0xd001, 0xd002, true}}},
180+
{{{0xe001, 0xe002, false}}}, {{{0xf001, 0xf002, true}}},
181+
};
182+
parseAndCheckBrstackEvents(1234, ExpectedSamples);
183+
}
184+
116185
TEST_F(PerfSpeEventsTestHelper, SpeBranchesAndCycles) {
117186
// Check perf input with SPE branch events and cycles.
118187
// Example collection command:

0 commit comments

Comments
 (0)