Skip to content

Commit dbd86f1

Browse files
[BOLT][AArch64] Introduce SPE mode in BasicAggregation
BOLT gains the ability to process branch target information generated by Arm SPE data, using the `BasicAggregation` format.

Example usage:

```bash
perf2bolt -p perf.data -o perf.boltdata --nl --spe BINARY
```

New branch data and compatibility:
---

SPE branch entries in perf data contain a branch pair (`IP` -> `ADDR`) for the source and destination of the branch. DataAggregator processes those by creating two basic samples. Any other event type will have its `ADDR` field set to `0x0`; for those, a single sample is created. Such events can be either SPE or non-SPE events, like `l1d-access` and `cycles` respectively.

The format of the input perf entries is:

```
PID EVENT-TYPE ADDR IP
```

In SPE mode, when:

- the host is not `AArch64`, BOLT will exit with a relevant message
- the `ADDR` field is unavailable, BOLT will exit with a relevant message
- no branch pairs were recorded, BOLT will present a warning

Examples of generating profiling data for the SPE mode:
---

Profiles can be captured with perf on AArch64 machines with SPE enabled. They can be combined with other events, SPE or not.

Capture only SPE branch data events:

```bash
perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY
```

Capture any SPE events:

```bash
perf record -e 'arm_spe_0//u' -- BINARY
```

Capture any SPE events and cycles:

```bash
perf record -e 'arm_spe_0//u' -e cycles:u -- BINARY
```

Add more filters and jitter, and specify a sample count, to control overhead and profile quality:

```bash
perf record -e 'arm_spe_0/branch_filter=1,load_filter=0,store_filter=0,jitter=1/u' -c 10007 -- BINARY
```
1 parent 0745add commit dbd86f1

File tree

7 files changed

+365
-11
lines changed

7 files changed

+365
-11
lines changed

bolt/include/bolt/Profile/DataAggregator.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ class DataAggregator : public DataReader {
7878
static bool checkPerfDataMagic(StringRef FileName);
7979

8080
private:
81+
friend struct PerfSpeEventsTestHelper;
82+
8183
struct PerfBranchSample {
8284
SmallVector<LBREntry, 32> LBR;
8385
uint64_t PC;
@@ -294,6 +296,15 @@ class DataAggregator : public DataReader {
294296
/// and a PC
295297
ErrorOr<PerfBasicSample> parseBasicSample();
296298

299+
/// Parse an Arm SPE entry into the non-lbr format by generating two basic
300+
/// samples. The format of an input SPE entry is:
301+
/// ```
302+
/// PID EVENT-TYPE ADDR IP
303+
/// ```
304+
/// SPE branch events will have 'ADDR' set to a branch target address while
305+
/// other perf or SPE events will have it set to zero.
306+
ErrorOr<std::pair<PerfBasicSample,PerfBasicSample>> parseSpeAsBasicSamples();
307+
297308
/// Parse a single perf sample containing a PID associated with an IP and
298309
/// address.
299310
ErrorOr<PerfMemSample> parseMemSample();
@@ -343,6 +354,9 @@ class DataAggregator : public DataReader {
343354
/// Process non-LBR events.
344355
void processBasicEvents();
345356

357+
/// Parse Arm SPE events into the non-LBR format.
358+
std::error_code parseSpeAsBasicEvents();
359+
346360
/// Parse the full output generated by perf script to report memory events.
347361
std::error_code parseMemEvents();
348362

bolt/lib/Profile/DataAggregator.cpp

Lines changed: 132 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ static cl::opt<bool>
4949
cl::desc("aggregate basic samples (without LBR info)"),
5050
cl::cat(AggregatorCategory));
5151

52+
/// Command-line switch (`--spe`) that enables Arm SPE mode. It is meant to be
/// combined with the no-LBR (basic aggregation) mode. The option is not
/// `static` because the driver refers to it through an `extern` declaration.
cl::opt<bool> ArmSPE(
    "spe",
    cl::desc(
        "Enable Arm SPE mode. Used in conjunction with no-lbr mode, i.e. "
        "`--spe --nl`"),
    cl::cat(AggregatorCategory));
58+
5259
static cl::opt<std::string>
5360
ITraceAggregation("itrace",
5461
cl::desc("Generate LBR info with perf itrace argument"),
@@ -180,22 +187,29 @@ void DataAggregator::start() {
180187

181188
findPerfExecutable();
182189

183-
if (opts::BasicAggregation) {
184-
launchPerfProcess("events without LBR",
185-
MainEventsPPI,
190+
if (opts::ArmSPE) {
191+
if (!opts::BasicAggregation) {
192+
errs() << "PERF2BOLT-ERROR: Arm SPE mode is combined only with "
193+
"BasicAggregation.\n";
194+
exit(1);
195+
}
196+
launchPerfProcess("branch events with SPE", MainEventsPPI,
197+
"script -F pid,event,ip,addr --itrace=i1i",
198+
/*Wait = */ false);
199+
} else if (opts::BasicAggregation) {
200+
launchPerfProcess("events without LBR", MainEventsPPI,
186201
"script -F pid,event,ip",
187-
/*Wait = */false);
202+
/*Wait = */ false);
188203
} else if (!opts::ITraceAggregation.empty()) {
189204
std::string ItracePerfScriptArgs = llvm::formatv(
190205
"script -F pid,ip,brstack --itrace={0}", opts::ITraceAggregation);
191206
launchPerfProcess("branch events with itrace", MainEventsPPI,
192207
ItracePerfScriptArgs.c_str(),
193208
/*Wait = */ false);
194209
} else {
195-
launchPerfProcess("branch events",
196-
MainEventsPPI,
210+
launchPerfProcess("branch events", MainEventsPPI,
197211
"script -F pid,ip,brstack",
198-
/*Wait = */false);
212+
/*Wait = */ false);
199213
}
200214

201215
// Note: we launch script for mem events regardless of the option, as the
@@ -531,14 +545,20 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
531545
"not read one from input binary\n";
532546
}
533547

534-
auto ErrorCallback = [](int ReturnCode, StringRef ErrBuf) {
548+
const Regex NoData("Samples for '.*' event do not have ADDR attribute set. "
549+
"Cannot print 'addr' field.");
550+
551+
auto ErrorCallback = [&NoData](int ReturnCode, StringRef ErrBuf) {
552+
if (opts::ArmSPE && NoData.match(ErrBuf)) {
553+
errs() << "PERF2BOLT-ERROR: perf data are incompatible for Arm SPE mode "
554+
"consumption. ADDR attribute is unset.\n";
555+
exit(1);
556+
}
535557
errs() << "PERF-ERROR: return code " << ReturnCode << "\n" << ErrBuf;
536558
exit(1);
537559
};
538560

539561
auto MemEventsErrorCallback = [&](int ReturnCode, StringRef ErrBuf) {
540-
Regex NoData("Samples for '.*' event do not have ADDR attribute set. "
541-
"Cannot print 'addr' field.");
542562
if (!NoData.match(ErrBuf))
543563
ErrorCallback(ReturnCode, ErrBuf);
544564
};
@@ -579,7 +599,8 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
579599
exit(0);
580600
}
581601

582-
if ((!opts::BasicAggregation && parseBranchEvents()) ||
602+
if (((!opts::BasicAggregation && !opts::ArmSPE) && parseBranchEvents()) ||
603+
(opts::BasicAggregation && opts::ArmSPE && parseSpeAsBasicEvents()) ||
583604
(opts::BasicAggregation && parseBasicEvents()))
584605
errs() << "PERF2BOLT: failed to parse samples\n";
585606

@@ -1226,6 +1247,66 @@ ErrorOr<DataAggregator::PerfBasicSample> DataAggregator::parseBasicSample() {
12261247
return PerfBasicSample{Event.get(), Address};
12271248
}
12281249

1250+
/// Parse a single perf entry of the form
/// ```
///   PID   EVENT-TYPE   ADDR   IP
/// ```
/// and turn it into a pair of basic samples: `first` is built from the `IP`
/// field and `second` from the `ADDR` field.
///
/// A half of the pair is left empty (null event name, PC == 0) when its
/// address is zero. Both halves are empty, and the rest of the line is
/// skipped, when the PID has no entry in BinaryMMapInfo.
///
/// \returns the sample pair, or the error code of the first field that failed
/// to parse. A missing end of line is reported as `io_error`.
ErrorOr<
    std::pair<DataAggregator::PerfBasicSample, DataAggregator::PerfBasicSample>>
DataAggregator::parseSpeAsBasicSamples() {
  // Skip leading field separators before the PID.
  while (checkAndConsumeFS()) {
  }

  ErrorOr<int64_t> PIDRes = parseNumberField(FieldSeparator, true);
  if (std::error_code EC = PIDRes.getError())
    return EC;

  constexpr PerfBasicSample EmptySample = PerfBasicSample{StringRef(), 0};
  auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes);
  if (MMapInfoIter == BinaryMMapInfo.end()) {
    // The PID has no entry in BinaryMMapInfo: drop the whole entry.
    consumeRestOfLine();
    return std::make_pair(EmptySample, EmptySample);
  }

  while (checkAndConsumeFS()) {
  }

  ErrorOr<StringRef> Event = parseString(FieldSeparator);
  if (std::error_code EC = Event.getError())
    return EC;

  while (checkAndConsumeFS()) {
  }

  // ADDR comes first on the line; for SPE branch events it is the branch
  // target, otherwise it is zero.
  ErrorOr<uint64_t> AddrResTo = parseHexField(FieldSeparator);
  if (std::error_code EC = AddrResTo.getError())
    return EC;
  consumeAllRemainingFS();

  // IP is the last field on the line.
  ErrorOr<uint64_t> AddrResFrom = parseHexField(FieldSeparator, true);
  if (std::error_code EC = AddrResFrom.getError())
    return EC;

  if (!checkAndConsumeNewLine()) {
    reportError("expected end of line");
    return make_error_code(llvm::errc::io_error);
  }

  // Show more meaningful event names on boltdata. Compare the StringRef
  // directly so no temporary std::string is built for every sample.
  if (*Event == "instructions:")
    Event = *AddrResTo != 0x0 ? "branch-spe:" : "instruction-spe:";

  auto genBasicSample = [&](uint64_t Address) {
    // When fed with non SPE branch events the target address will be null.
    // This is expected and ignored.
    if (Address == 0x0)
      return EmptySample;

    if (!BC->HasFixedLoadAddress)
      adjustAddress(Address, MMapInfoIter->second);
    return PerfBasicSample{Event.get(), Address};
  };

  return std::make_pair(genBasicSample(*AddrResFrom),
                        genBasicSample(*AddrResTo));
}
1309+
12291310
ErrorOr<DataAggregator::PerfMemSample> DataAggregator::parseMemSample() {
12301311
PerfMemSample Res{0, 0};
12311312

@@ -1703,6 +1784,46 @@ std::error_code DataAggregator::parseBasicEvents() {
17031784
return std::error_code();
17041785
}
17051786

1787+
std::error_code DataAggregator::parseSpeAsBasicEvents() {
1788+
outs() << "PERF2BOLT: parsing SPE data as basic events (no LBR)...\n";
1789+
NamedRegionTimer T("parseSPEBasic", "Parsing SPE as basic events",
1790+
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
1791+
uint64_t NumSpeBranchSamples = 0;
1792+
1793+
// Convert entries to one or two basic samples, depending on whether there is
1794+
// branch target information.
1795+
while (hasData()) {
1796+
auto SamplePair = parseSpeAsBasicSamples();
1797+
if (std::error_code EC = SamplePair.getError())
1798+
return EC;
1799+
1800+
auto registerSample = [this](const PerfBasicSample *Sample) {
1801+
if (!Sample->PC)
1802+
return;
1803+
1804+
if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Sample->PC))
1805+
BF->setHasProfileAvailable();
1806+
1807+
++BasicSamples[Sample->PC];
1808+
EventNames.insert(Sample->EventName);
1809+
};
1810+
1811+
if (SamplePair->first.PC != 0x0 && SamplePair->second.PC != 0x0)
1812+
++NumSpeBranchSamples;
1813+
1814+
registerSample(&SamplePair->first);
1815+
registerSample(&SamplePair->second);
1816+
}
1817+
1818+
if (NumSpeBranchSamples == 0)
1819+
errs() << "PERF2BOLT-WARNING: no SPE branches found\n";
1820+
else
1821+
outs() << "PERF2BOLT: found " << NumSpeBranchSamples
1822+
<< " SPE branch sample pairs.\n";
1823+
1824+
return std::error_code();
1825+
}
1826+
17061827
void DataAggregator::processBasicEvents() {
17071828
outs() << "PERF2BOLT: processing basic events (without LBR)...\n";
17081829
NamedRegionTimer T("processBasic", "Processing basic events", TimerGroupName,
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
## Check that Arm SPE mode is available on AArch64 with BasicAggregation.
2+
3+
REQUIRES: system-linux,perf,target=aarch64{{.*}}
4+
5+
RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
6+
RUN: touch %t.empty.perf.data
7+
RUN: perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --nl --spe --pa %t.exe 2>&1 | FileCheck %s --check-prefix=CHECK-SPE-NO-LBR
8+
9+
CHECK-SPE-NO-LBR: PERF2BOLT: Starting data aggregation job
10+
11+
RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe
12+
RUN: not perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2>&1 | FileCheck %s --check-prefix=CHECK-SPE-LBR
13+
14+
CHECK-SPE-LBR: PERF2BOLT-ERROR: Arm SPE mode is combined only with BasicAggregation.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
## Check that Arm SPE mode is unavailable on X86.
2+
3+
REQUIRES: system-linux,x86_64-linux
4+
5+
RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
6+
RUN: touch %t.empty.perf.data
7+
RUN: not perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --nl --spe --pa %t.exe 2>&1 | FileCheck %s
8+
9+
CHECK: BOLT-ERROR: -spe is available only on AArch64.

bolt/tools/driver/llvm-bolt.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ static cl::opt<std::string> InputFilename(cl::Positional,
5151
cl::Required, cl::cat(BoltCategory),
5252
cl::sub(cl::SubCommand::getAll()));
5353

54+
extern cl::opt<bool> ArmSPE;
55+
5456
static cl::opt<std::string>
5557
InputDataFilename("data",
5658
cl::desc("<data file>"),
@@ -245,6 +247,13 @@ int main(int argc, char **argv) {
245247
if (Error E = RIOrErr.takeError())
246248
report_error(opts::InputFilename, std::move(E));
247249
RewriteInstance &RI = *RIOrErr.get();
250+
251+
if (opts::AggregateOnly && !RI.getBinaryContext().isAArch64() &&
252+
opts::ArmSPE == 1) {
253+
errs() << "BOLT-ERROR: -spe is available only on AArch64.\n";
254+
exit(1);
255+
}
256+
248257
if (!opts::PerfData.empty()) {
249258
if (!opts::AggregateOnly) {
250259
errs() << ToolName
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,25 @@
1+
set(LLVM_LINK_COMPONENTS
2+
DebugInfoDWARF
3+
Object
4+
${LLVM_TARGETS_TO_BUILD}
5+
)
6+
17
add_bolt_unittest(ProfileTests
28
DataAggregator.cpp
9+
PerfSpeEvents.cpp
310

411
DISABLE_LLVM_LINK_LLVM_DYLIB
512
)
613

714
target_link_libraries(ProfileTests
815
PRIVATE
16+
LLVMBOLTCore
917
LLVMBOLTProfile
18+
LLVMTargetParser
19+
LLVMTestingSupport
1020
)
1121

22+
foreach (tgt ${BOLT_TARGETS_TO_BUILD})
23+
string(TOUPPER "${tgt}" upper)
24+
target_compile_definitions(ProfileTests PRIVATE "${upper}_AVAILABLE")
25+
endforeach()

0 commit comments

Comments
 (0)