Skip to content

Commit 4c68b93

Browse files
[BOLT][AArch64] Introduce SPE mode in BasicAggregation
BOLT gains the ability to process branch target information from Arm SPE data, using the `BasicAggregation` format. Example usage is: ```bash perf2bolt -p perf.data -o perf.boltdata --nl --spe BINARY ``` New branch data and compatibility: --- SPE branch entries in perf data contain a branch pair (`IP` -> `ADDR`) for the branch source and destination addresses. DataAggregator processes those by creating two basic samples. All other event types will have the `ADDR` field set to `0x0`. For those, a single sample will be created. Such events can be either SPE or non-SPE, like `l1d-access` and `cycles` respectively. The format of the input perf entries is: ``` PID EVENT-TYPE ADDR IP ``` When in SPE mode and: - the host is not `AArch64`, BOLT will exit with a relevant message - the `ADDR` field is unavailable, BOLT will exit with a relevant message - no branch pairs were recorded, BOLT will present a warning Examples of generating profiling data for the SPE mode: --- Profiles can be captured with perf on AArch64 machines with SPE enabled. They can be combined with other events, SPE or not. Capture only SPE branch data events: ```bash perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY ``` Capture any SPE events: ```bash perf record -e 'arm_spe_0//u' -- BINARY ``` Capture any SPE events and cycles: ```bash perf record -e 'arm_spe_0//u' -e cycles:u -- BINARY ``` Add more filters, jitter, and a sample count to control overhead/quality: ```bash perf record -e 'arm_spe_0/branch_filter=1,load_filter=0,store_filter=0,jitter=1/u' -c 10007 -- BINARY ```
1 parent c6a907a commit 4c68b93

File tree

7 files changed

+363
-8
lines changed

7 files changed

+363
-8
lines changed

bolt/include/bolt/Profile/DataAggregator.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ class DataAggregator : public DataReader {
7878
static bool checkPerfDataMagic(StringRef FileName);
7979

8080
private:
81+
friend struct PerfSpeEventsTestHelper;
82+
8183
struct PerfBranchSample {
8284
SmallVector<LBREntry, 32> LBR;
8385
};
@@ -296,6 +298,15 @@ class DataAggregator : public DataReader {
296298
/// and a PC
297299
ErrorOr<PerfBasicSample> parseBasicSample();
298300

301+
/// Parse an Arm SPE entry into the non-LBR format by generating two basic
302+
/// samples. The format of an input SPE entry is:
303+
/// ```
304+
/// PID EVENT-TYPE ADDR IP
305+
/// ```
306+
/// SPE branch events will have 'ADDR' set to a branch target address while
307+
/// other perf or SPE events will have it set to zero.
308+
ErrorOr<std::pair<PerfBasicSample,PerfBasicSample>> parseSpeAsBasicSamples();
309+
299310
/// Parse a single perf sample containing a PID associated with an IP and
300311
/// address.
301312
ErrorOr<PerfMemSample> parseMemSample();
@@ -342,6 +353,9 @@ class DataAggregator : public DataReader {
342353
/// Process non-LBR events.
343354
void processBasicEvents();
344355

356+
/// Parse Arm SPE events into the non-LBR format.
357+
std::error_code parseSpeAsBasicEvents();
358+
345359
/// Parse the full output generated by perf script to report memory events.
346360
std::error_code parseMemEvents();
347361

bolt/lib/Profile/DataAggregator.cpp

Lines changed: 130 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ static cl::opt<bool>
4949
cl::desc("aggregate basic samples (without LBR info)"),
5050
cl::cat(AggregatorCategory));
5151

52+
/// Command-line switch `--spe`: enables Arm SPE mode. DataAggregator::start()
/// rejects it unless basic (no-LBR) aggregation is requested as well. The
/// option has external linkage because the llvm-bolt driver refers to it
/// through an `extern` declaration.
cl::opt<bool> ArmSPE(
    "spe",
    cl::desc(
        "Enable Arm SPE mode. Used in conjunction with no-lbr mode, ie `--spe "
        "--nl`"),
    cl::cat(AggregatorCategory));
58+
5259
static cl::opt<std::string>
5360
ITraceAggregation("itrace",
5461
cl::desc("Generate LBR info with perf itrace argument"),
@@ -171,11 +178,19 @@ void DataAggregator::start() {
171178

172179
findPerfExecutable();
173180

174-
if (opts::BasicAggregation) {
175-
launchPerfProcess("events without LBR",
176-
MainEventsPPI,
181+
if (opts::ArmSPE) {
182+
if (!opts::BasicAggregation) {
183+
errs() << "PERF2BOLT-ERROR: Arm SPE mode is combined only with "
184+
"BasicAggregation.\n";
185+
exit(1);
186+
}
187+
launchPerfProcess("branch events with SPE", MainEventsPPI,
188+
"script -F pid,event,ip,addr --itrace=i1i",
189+
/*Wait = */ false);
190+
} else if (opts::BasicAggregation) {
191+
launchPerfProcess("events without LBR", MainEventsPPI,
177192
"script -F pid,event,ip",
178-
/*Wait = */false);
193+
/*Wait = */ false);
179194
} else if (!opts::ITraceAggregation.empty()) {
180195
std::string ItracePerfScriptArgs = llvm::formatv(
181196
"script -F pid,brstack --itrace={0}", opts::ITraceAggregation);
@@ -459,14 +474,20 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
459474
"not read one from input binary\n";
460475
}
461476

462-
auto ErrorCallback = [](int ReturnCode, StringRef ErrBuf) {
477+
const Regex NoData("Samples for '.*' event do not have ADDR attribute set. "
478+
"Cannot print 'addr' field.");
479+
480+
auto ErrorCallback = [&NoData](int ReturnCode, StringRef ErrBuf) {
481+
if (opts::ArmSPE && NoData.match(ErrBuf)) {
482+
errs() << "PERF2BOLT-ERROR: perf data are incompatible for Arm SPE mode "
483+
"consumption. ADDR attribute is unset.\n";
484+
exit(1);
485+
}
463486
errs() << "PERF-ERROR: return code " << ReturnCode << "\n" << ErrBuf;
464487
exit(1);
465488
};
466489

467490
auto MemEventsErrorCallback = [&](int ReturnCode, StringRef ErrBuf) {
468-
Regex NoData("Samples for '.*' event do not have ADDR attribute set. "
469-
"Cannot print 'addr' field.");
470491
if (!NoData.match(ErrBuf))
471492
ErrorCallback(ReturnCode, ErrBuf);
472493
};
@@ -507,7 +528,8 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
507528
exit(0);
508529
}
509530

510-
if ((!opts::BasicAggregation && parseBranchEvents()) ||
531+
if (((!opts::BasicAggregation && !opts::ArmSPE) && parseBranchEvents()) ||
532+
(opts::BasicAggregation && opts::ArmSPE && parseSpeAsBasicEvents()) ||
511533
(opts::BasicAggregation && parseBasicEvents()))
512534
errs() << "PERF2BOLT: failed to parse samples\n";
513535

@@ -1138,6 +1160,66 @@ ErrorOr<DataAggregator::PerfBasicSample> DataAggregator::parseBasicSample() {
11381160
return PerfBasicSample{Event.get(), Address};
11391161
}
11401162

1163+
/// Parse one SPE-mode entry of the perf output and convert it into a pair of
/// basic samples: the first is built from the IP field, the second from the
/// ADDR field. The expected entry layout is `PID EVENT-TYPE ADDR IP`.
///
/// A field whose address is zero produces an empty sample (empty event name,
/// PC 0). An entry whose PID has no record in BinaryMMapInfo is skipped and
/// produces two empty samples. Any field that fails to parse, or a missing end
/// of line, is reported through the returned error code.
ErrorOr<
    std::pair<DataAggregator::PerfBasicSample, DataAggregator::PerfBasicSample>>
DataAggregator::parseSpeAsBasicSamples() {
  // Consume a run of field separators at the current parsing position.
  auto SkipSeparators = [this] {
    while (checkAndConsumeFS()) {
    }
  };

  // PID field.
  SkipSeparators();
  ErrorOr<int64_t> PIDRes = parseNumberField(FieldSeparator, true);
  if (std::error_code EC = PIDRes.getError())
    return EC;

  constexpr PerfBasicSample EmptySample = PerfBasicSample{StringRef(), 0};
  auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes);
  if (MMapInfoIter == BinaryMMapInfo.end()) {
    // No mmap info recorded for this PID: drop the whole entry.
    consumeRestOfLine();
    return std::make_pair(EmptySample, EmptySample);
  }

  // EVENT-TYPE field.
  SkipSeparators();
  ErrorOr<StringRef> Event = parseString(FieldSeparator);
  if (std::error_code EC = Event.getError())
    return EC;

  // ADDR field (branch target; zero for entries without a target).
  SkipSeparators();
  ErrorOr<uint64_t> AddrResTo = parseHexField(FieldSeparator);
  if (std::error_code EC = AddrResTo.getError())
    return EC;
  consumeAllRemainingFS();

  // IP field, which is the last one on the line.
  ErrorOr<uint64_t> AddrResFrom = parseHexField(FieldSeparator, true);
  if (std::error_code EC = AddrResFrom.getError())
    return EC;

  if (!checkAndConsumeNewLine()) {
    reportError("expected end of line");
    return make_error_code(llvm::errc::io_error);
  }

  // Show more meaningful event names on boltdata.
  if (Event->str() == "instructions:") {
    const bool HasTarget = *AddrResTo != 0x0;
    Event = HasTarget ? "branch-spe:" : "instruction-spe:";
  }

  // Build a sample for one address of the entry. A null address is expected
  // for non-branch events and maps to the empty sample.
  auto MakeSample = [&](uint64_t Address) -> PerfBasicSample {
    if (Address == 0x0)
      return EmptySample;

    if (!BC->HasFixedLoadAddress)
      adjustAddress(Address, MMapInfoIter->second);
    return PerfBasicSample{Event.get(), Address};
  };

  PerfBasicSample FromSample = MakeSample(*AddrResFrom);
  PerfBasicSample ToSample = MakeSample(*AddrResTo);
  return std::make_pair(FromSample, ToSample);
}
1222+
11411223
ErrorOr<DataAggregator::PerfMemSample> DataAggregator::parseMemSample() {
11421224
PerfMemSample Res{0, 0};
11431225

@@ -1643,6 +1725,46 @@ std::error_code DataAggregator::parseBasicEvents() {
16431725
return std::error_code();
16441726
}
16451727

1728+
std::error_code DataAggregator::parseSpeAsBasicEvents() {
1729+
outs() << "PERF2BOLT: parsing SPE data as basic events (no LBR)...\n";
1730+
NamedRegionTimer T("parseSPEBasic", "Parsing SPE as basic events",
1731+
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
1732+
uint64_t NumSpeBranchSamples = 0;
1733+
1734+
// Convert entries to one or two basic samples, depending on whether there is
1735+
// branch target information.
1736+
while (hasData()) {
1737+
auto SamplePair = parseSpeAsBasicSamples();
1738+
if (std::error_code EC = SamplePair.getError())
1739+
return EC;
1740+
1741+
auto registerSample = [this](const PerfBasicSample *Sample) {
1742+
if (!Sample->PC)
1743+
return;
1744+
1745+
if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Sample->PC))
1746+
BF->setHasProfileAvailable();
1747+
1748+
++BasicSamples[Sample->PC];
1749+
EventNames.insert(Sample->EventName);
1750+
};
1751+
1752+
if (SamplePair->first.PC != 0x0 && SamplePair->second.PC != 0x0)
1753+
++NumSpeBranchSamples;
1754+
1755+
registerSample(&SamplePair->first);
1756+
registerSample(&SamplePair->second);
1757+
}
1758+
1759+
if (NumSpeBranchSamples == 0)
1760+
errs() << "PERF2BOLT-WARNING: no SPE branches found\n";
1761+
else
1762+
outs() << "PERF2BOLT: found " << NumSpeBranchSamples
1763+
<< " SPE branch sample pairs.\n";
1764+
1765+
return std::error_code();
1766+
}
1767+
16461768
void DataAggregator::processBasicEvents() {
16471769
outs() << "PERF2BOLT: processing basic events (without LBR)...\n";
16481770
NamedRegionTimer T("processBasic", "Processing basic events", TimerGroupName,
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
## Check that Arm SPE mode is available on AArch64 with BasicAggregation.
2+
3+
REQUIRES: system-linux,perf,target=aarch64{{.*}}
4+
5+
RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
6+
RUN: touch %t.empty.perf.data
7+
RUN: perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --nl --spe --pa %t.exe 2>&1 | FileCheck %s --check-prefix=CHECK-SPE-NO-LBR
8+
9+
CHECK-SPE-NO-LBR: PERF2BOLT: Starting data aggregation job
10+
11+
RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe
12+
RUN: not perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2>&1 | FileCheck %s --check-prefix=CHECK-SPE-LBR
13+
14+
CHECK-SPE-LBR: PERF2BOLT-ERROR: Arm SPE mode is combined only with BasicAggregation.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
## Check that Arm SPE mode is unavailable on X86.
2+
3+
REQUIRES: system-linux,x86_64-linux
4+
5+
RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
6+
RUN: touch %t.empty.perf.data
7+
RUN: not perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --nl --spe --pa %t.exe 2>&1 | FileCheck %s
8+
9+
CHECK: BOLT-ERROR: -spe is available only on AArch64.

bolt/tools/driver/llvm-bolt.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ static cl::opt<std::string> InputFilename(cl::Positional,
5151
cl::Required, cl::cat(BoltCategory),
5252
cl::sub(cl::SubCommand::getAll()));
5353

54+
extern cl::opt<bool> ArmSPE;
55+
5456
static cl::opt<std::string>
5557
InputDataFilename("data",
5658
cl::desc("<data file>"),
@@ -237,6 +239,13 @@ int main(int argc, char **argv) {
237239
if (Error E = RIOrErr.takeError())
238240
report_error(opts::InputFilename, std::move(E));
239241
RewriteInstance &RI = *RIOrErr.get();
242+
243+
if (opts::AggregateOnly && !RI.getBinaryContext().isAArch64() &&
244+
opts::ArmSPE == 1) {
245+
errs() << "BOLT-ERROR: -spe is available only on AArch64.\n";
246+
exit(1);
247+
}
248+
240249
if (!opts::PerfData.empty()) {
241250
if (!opts::AggregateOnly) {
242251
errs() << ToolName
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,25 @@
1+
set(LLVM_LINK_COMPONENTS
2+
DebugInfoDWARF
3+
Object
4+
${LLVM_TARGETS_TO_BUILD}
5+
)
6+
17
add_bolt_unittest(ProfileTests
28
DataAggregator.cpp
9+
PerfSpeEvents.cpp
310

411
DISABLE_LLVM_LINK_LLVM_DYLIB
512
)
613

714
target_link_libraries(ProfileTests
815
PRIVATE
16+
LLVMBOLTCore
917
LLVMBOLTProfile
18+
LLVMTargetParser
19+
LLVMTestingSupport
1020
)
1121

22+
foreach (tgt ${BOLT_TARGETS_TO_BUILD})
23+
string(TOUPPER "${tgt}" upper)
24+
target_compile_definitions(ProfileTests PRIVATE "${upper}_AVAILABLE")
25+
endforeach()

0 commit comments

Comments
 (0)