Skip to content

Commit a203f8b

Browse files
paschalis-mpeiskaadam
authored and committed
[BOLT][AArch64] Introduce SPE mode in BasicAggregation
BOLT gains the ability to process branch target information generated by Arm SPE data, using the `BasicAggregation` format.

Example usage is:

```bash
perf2bolt -p perf.data -o perf.boltdata --nl --spe BINARY
```

New branch data and compatibility:
---

SPE branch entries in perf data contain a branch pair (`IP` -> `ADDR`) for the source and destination branches. DataAggregator processes those by creating two basic samples. Any other event type will have its `ADDR` field set to `0x0`. For those a single sample will be created. Such events can be either SPE or non-SPE, like `l1d-access` and `cycles` respectively.

The format of the input perf entries is:

```
PID EVENT-TYPE ADDR IP
```

When in SPE mode and:

- host is not `AArch64`, BOLT will exit with a relevant message
- `ADDR` field is unavailable, BOLT will exit with a relevant message
- no branch pairs were recorded, BOLT will present a warning

Examples of generating profiling data for the SPE mode:
---

Profiles can be captured with perf on AArch64 machines with SPE enabled. They can be combined with other events, SPE or not.

Capture only SPE branch data events:

```bash
perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY
```

Capture any SPE events:

```bash
perf record -e 'arm_spe_0//u' -- BINARY
```

Capture any SPE events and cycles:

```bash
perf record -e 'arm_spe_0//u' -e cycles:u -- BINARY
```

Use more filters, jitter, and an explicit sample count to control overhead and quality:

```bash
perf record -e 'arm_spe_0/branch_filter=1,load_filter=0,store_filter=0,jitter=1/u' -c 10007 -- BINARY
```
1 parent c0a9c90 commit a203f8b

File tree

7 files changed

+363
-8
lines changed

7 files changed

+363
-8
lines changed

bolt/include/bolt/Profile/DataAggregator.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ class DataAggregator : public DataReader {
8585
};
8686
friend raw_ostream &operator<<(raw_ostream &OS, const LBREntry &);
8787

88+
friend struct PerfSpeEventsTestHelper;
89+
8890
struct PerfBranchSample {
8991
SmallVector<LBREntry, 32> LBR;
9092
};
@@ -295,6 +297,15 @@ class DataAggregator : public DataReader {
295297
/// and a PC
296298
ErrorOr<PerfBasicSample> parseBasicSample();
297299

300+
/// Parse an Arm SPE entry into the non-lbr format by generating two basic
301+
/// samples. The format of an input SPE entry is:
302+
/// ```
303+
/// PID EVENT-TYPE ADDR IP
304+
/// ```
305+
/// SPE branch events will have 'ADDR' set to a branch target address while
306+
/// other perf or SPE events will have it set to zero.
///
/// The first element of the returned pair is the sample built from 'IP' and
/// the second is the sample built from 'ADDR'. A sample whose address is zero
/// is returned empty (PC == 0), and both samples are empty when the entry's
/// PID is not tracked in BinaryMMapInfo. An error code is returned when a
/// field cannot be parsed or the entry is not terminated by a newline.
307+
ErrorOr<std::pair<PerfBasicSample,PerfBasicSample>> parseSpeAsBasicSamples();
308+
298309
/// Parse a single perf sample containing a PID associated with an IP and
299310
/// address.
300311
ErrorOr<PerfMemSample> parseMemSample();
@@ -341,6 +352,9 @@ class DataAggregator : public DataReader {
341352
/// Process non-LBR events.
342353
void processBasicEvents();
343354

355+
/// Parse Arm SPE events into the non-LBR format.
///
/// Consumes entries until the input is exhausted, recording each non-empty
/// sample in BasicSamples and its event name in EventNames. Returns the first
/// parse error encountered, or a default-constructed error code otherwise.
/// A warning is printed when no entry carried both a source and a target.
356+
std::error_code parseSpeAsBasicEvents();
357+
344358
/// Parse the full output generated by perf script to report memory events.
345359
std::error_code parseMemEvents();
346360

bolt/lib/Profile/DataAggregator.cpp

Lines changed: 130 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ static cl::opt<bool>
4949
cl::desc("aggregate basic samples (without LBR info)"),
5050
cl::cat(AggregatorCategory));
5151

52+
/// Command-line switch selecting Arm SPE mode. It is only valid together with
/// basic aggregation (`--nl`). The option is intentionally not `static`: the
/// llvm-bolt driver refers to it through an `extern` declaration.
cl::opt<bool> ArmSPE(
    "spe",
    cl::desc("Enable Arm SPE mode. Used in conjunction with no-lbr mode, "
             "i.e. `--spe --nl`"),
    cl::cat(AggregatorCategory));
58+
5259
static cl::opt<std::string>
5360
ITraceAggregation("itrace",
5461
cl::desc("Generate LBR info with perf itrace argument"),
@@ -181,11 +188,19 @@ void DataAggregator::start() {
181188

182189
findPerfExecutable();
183190

184-
if (opts::BasicAggregation) {
185-
launchPerfProcess("events without LBR",
186-
MainEventsPPI,
191+
if (opts::ArmSPE) {
192+
if (!opts::BasicAggregation) {
193+
errs() << "PERF2BOLT-ERROR: Arm SPE mode is combined only with "
194+
"BasicAggregation.\n";
195+
exit(1);
196+
}
197+
launchPerfProcess("branch events with SPE", MainEventsPPI,
198+
"script -F pid,event,ip,addr --itrace=i1i",
199+
/*Wait = */ false);
200+
} else if (opts::BasicAggregation) {
201+
launchPerfProcess("events without LBR", MainEventsPPI,
187202
"script -F pid,event,ip",
188-
/*Wait = */false);
203+
/*Wait = */ false);
189204
} else if (!opts::ITraceAggregation.empty()) {
190205
// Disable parsing memory profile from trace data, unless requested by user.
191206
if (!opts::ParseMemProfile.getNumOccurrences())
@@ -456,14 +471,20 @@ int DataAggregator::prepareToParse(StringRef Name, PerfProcessInfo &Process,
456471
Error DataAggregator::preprocessProfile(BinaryContext &BC) {
457472
this->BC = &BC;
458473

459-
auto ErrorCallback = [](int ReturnCode, StringRef ErrBuf) {
474+
const Regex NoData("Samples for '.*' event do not have ADDR attribute set. "
475+
"Cannot print 'addr' field.");
476+
477+
auto ErrorCallback = [&NoData](int ReturnCode, StringRef ErrBuf) {
478+
if (opts::ArmSPE && NoData.match(ErrBuf)) {
479+
errs() << "PERF2BOLT-ERROR: perf data are incompatible for Arm SPE mode "
480+
"consumption. ADDR attribute is unset.\n";
481+
exit(1);
482+
}
460483
errs() << "PERF-ERROR: return code " << ReturnCode << "\n" << ErrBuf;
461484
exit(1);
462485
};
463486

464487
auto MemEventsErrorCallback = [&](int ReturnCode, StringRef ErrBuf) {
465-
Regex NoData("Samples for '.*' event do not have ADDR attribute set. "
466-
"Cannot print 'addr' field.");
467488
if (!NoData.match(ErrBuf))
468489
ErrorCallback(ReturnCode, ErrBuf);
469490
};
@@ -509,7 +530,8 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
509530
filterBinaryMMapInfo();
510531
prepareToParse("events", MainEventsPPI, ErrorCallback);
511532

512-
if ((!opts::BasicAggregation && parseBranchEvents()) ||
533+
if (((!opts::BasicAggregation && !opts::ArmSPE) && parseBranchEvents()) ||
534+
(opts::BasicAggregation && opts::ArmSPE && parseSpeAsBasicEvents()) ||
513535
(opts::BasicAggregation && parseBasicEvents()))
514536
errs() << "PERF2BOLT: failed to parse samples\n";
515537

@@ -1120,6 +1142,66 @@ ErrorOr<DataAggregator::PerfBasicSample> DataAggregator::parseBasicSample() {
11201142
return PerfBasicSample{Event.get(), Address};
11211143
}
11221144

1145+
/// Parse one entry of `perf script -F pid,event,ip,addr` output, laid out as
///   PID EVENT-TYPE ADDR IP
/// into a pair of basic samples: `first` is built from IP (the branch source)
/// and `second` from ADDR (the branch target).
///
/// A sample is left empty (default event name, PC == 0) when its address is
/// zero, which is how entries without branch target information show up. Both
/// samples are empty when the PID is not present in BinaryMMapInfo; the rest
/// of that line is discarded.
///
/// Returns an error code if any field fails to parse or if the entry is not
/// terminated by a newline.
ErrorOr<
    std::pair<DataAggregator::PerfBasicSample, DataAggregator::PerfBasicSample>>
DataAggregator::parseSpeAsBasicSamples() {
  // Skip leading field separators before the PID.
  while (checkAndConsumeFS()) {
  }

  ErrorOr<int64_t> PIDRes = parseNumberField(FieldSeparator, true);
  if (std::error_code EC = PIDRes.getError())
    return EC;

  constexpr PerfBasicSample EmptySample = PerfBasicSample{StringRef(), 0};
  auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes);
  if (MMapInfoIter == BinaryMMapInfo.end()) {
    // Entry belongs to a process we are not profiling; drop it.
    consumeRestOfLine();
    return std::make_pair(EmptySample, EmptySample);
  }

  while (checkAndConsumeFS()) {
  }

  ErrorOr<StringRef> Event = parseString(FieldSeparator);
  if (std::error_code EC = Event.getError())
    return EC;

  while (checkAndConsumeFS()) {
  }

  // ADDR comes first on the line and holds the branch target (or zero).
  ErrorOr<uint64_t> AddrResTo = parseHexField(FieldSeparator);
  if (std::error_code EC = AddrResTo.getError())
    return EC;
  consumeAllRemainingFS();

  // IP is the last field on the line and holds the sampled/source address.
  ErrorOr<uint64_t> AddrResFrom = parseHexField(FieldSeparator, true);
  if (std::error_code EC = AddrResFrom.getError())
    return EC;

  if (!checkAndConsumeNewLine()) {
    reportError("expected end of line");
    return make_error_code(llvm::errc::io_error);
  }

  // Event is captured by reference on purpose: the lambda is invoked only
  // after the event name has been rewritten below.
  auto genBasicSample = [&](uint64_t Address) {
    // When fed with non SPE branch events the target address will be null.
    // This is expected and ignored.
    if (Address == 0x0)
      return EmptySample;

    if (!BC->HasFixedLoadAddress)
      adjustAddress(Address, MMapInfoIter->second);
    return PerfBasicSample{Event.get(), Address};
  };

  // Show more meaningful event names on boltdata. Compare the StringRef
  // directly so that no temporary std::string is built for every sample.
  if (*Event == "instructions:")
    Event = *AddrResTo != 0x0 ? "branch-spe:" : "instruction-spe:";

  return std::make_pair(genBasicSample(*AddrResFrom),
                        genBasicSample(*AddrResTo));
}
1204+
11231205
ErrorOr<DataAggregator::PerfMemSample> DataAggregator::parseMemSample() {
11241206
PerfMemSample Res{0, 0};
11251207

@@ -1601,6 +1683,46 @@ std::error_code DataAggregator::parseBasicEvents() {
16011683
return std::error_code();
16021684
}
16031685

1686+
std::error_code DataAggregator::parseSpeAsBasicEvents() {
1687+
outs() << "PERF2BOLT: parsing SPE data as basic events (no LBR)...\n";
1688+
NamedRegionTimer T("parseSPEBasic", "Parsing SPE as basic events",
1689+
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
1690+
uint64_t NumSpeBranchSamples = 0;
1691+
1692+
// Convert entries to one or two basic samples, depending on whether there is
1693+
// branch target information.
1694+
while (hasData()) {
1695+
auto SamplePair = parseSpeAsBasicSamples();
1696+
if (std::error_code EC = SamplePair.getError())
1697+
return EC;
1698+
1699+
auto registerSample = [this](const PerfBasicSample *Sample) {
1700+
if (!Sample->PC)
1701+
return;
1702+
1703+
if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Sample->PC))
1704+
BF->setHasProfileAvailable();
1705+
1706+
++BasicSamples[Sample->PC];
1707+
EventNames.insert(Sample->EventName);
1708+
};
1709+
1710+
if (SamplePair->first.PC != 0x0 && SamplePair->second.PC != 0x0)
1711+
++NumSpeBranchSamples;
1712+
1713+
registerSample(&SamplePair->first);
1714+
registerSample(&SamplePair->second);
1715+
}
1716+
1717+
if (NumSpeBranchSamples == 0)
1718+
errs() << "PERF2BOLT-WARNING: no SPE branches found\n";
1719+
else
1720+
outs() << "PERF2BOLT: found " << NumSpeBranchSamples
1721+
<< " SPE branch sample pairs.\n";
1722+
1723+
return std::error_code();
1724+
}
1725+
16041726
void DataAggregator::processBasicEvents() {
16051727
outs() << "PERF2BOLT: processing basic events (without LBR)...\n";
16061728
NamedRegionTimer T("processBasic", "Processing basic events", TimerGroupName,
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
## Check that Arm SPE mode is available on AArch64 with BasicAggregation.
2+
3+
REQUIRES: system-linux,perf,target=aarch64{{.*}}
4+
5+
RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
6+
RUN: touch %t.empty.perf.data
7+
RUN: perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --nl --spe --pa %t.exe 2>&1 | FileCheck %s --check-prefix=CHECK-SPE-NO-LBR
8+
9+
CHECK-SPE-NO-LBR: PERF2BOLT: Starting data aggregation job
10+
11+
RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe
12+
RUN: not perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2>&1 | FileCheck %s --check-prefix=CHECK-SPE-LBR
13+
14+
CHECK-SPE-LBR: PERF2BOLT-ERROR: Arm SPE mode is combined only with BasicAggregation.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
## Check that Arm SPE mode is unavailable on X86.
2+
3+
REQUIRES: system-linux,x86_64-linux
4+
5+
RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
6+
RUN: touch %t.empty.perf.data
7+
RUN: not perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --nl --spe --pa %t.exe 2>&1 | FileCheck %s
8+
9+
CHECK: BOLT-ERROR: -spe is available only on AArch64.

bolt/tools/driver/llvm-bolt.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ static cl::opt<std::string> InputFilename(cl::Positional,
5151
cl::Required, cl::cat(BoltCategory),
5252
cl::sub(cl::SubCommand::getAll()));
5353

54+
extern cl::opt<bool> ArmSPE;
55+
5456
static cl::opt<std::string>
5557
InputDataFilename("data",
5658
cl::desc("<data file>"),
@@ -237,6 +239,13 @@ int main(int argc, char **argv) {
237239
if (Error E = RIOrErr.takeError())
238240
report_error(opts::InputFilename, std::move(E));
239241
RewriteInstance &RI = *RIOrErr.get();
242+
243+
if (opts::AggregateOnly && !RI.getBinaryContext().isAArch64() &&
244+
opts::ArmSPE == 1) {
245+
errs() << "BOLT-ERROR: -spe is available only on AArch64.\n";
246+
exit(1);
247+
}
248+
240249
if (!opts::PerfData.empty()) {
241250
if (!opts::AggregateOnly) {
242251
errs() << ToolName
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,25 @@
1+
set(LLVM_LINK_COMPONENTS
2+
DebugInfoDWARF
3+
Object
4+
${LLVM_TARGETS_TO_BUILD}
5+
)
6+
17
add_bolt_unittest(ProfileTests
28
DataAggregator.cpp
9+
PerfSpeEvents.cpp
310

411
DISABLE_LLVM_LINK_LLVM_DYLIB
512
)
613

714
target_link_libraries(ProfileTests
815
PRIVATE
16+
LLVMBOLTCore
917
LLVMBOLTProfile
18+
LLVMTargetParser
19+
LLVMTestingSupport
1020
)
1121

22+
foreach (tgt ${BOLT_TARGETS_TO_BUILD})
23+
string(TOUPPER "${tgt}" upper)
24+
target_compile_definitions(ProfileTests PRIVATE "${upper}_AVAILABLE")
25+
endforeach()

0 commit comments

Comments
 (0)