From c846650dab434c3eb570bbc4cb5773833f5325d2 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Sat, 7 Jun 2025 15:58:09 -0700 Subject: [PATCH 1/5] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20ch?= =?UTF-8?q?anges=20to=20main=20this=20commit=20is=20based=20on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.4 [skip ci] --- bolt/include/bolt/Core/BinaryFunction.h | 12 ++++ bolt/include/bolt/Profile/DataAggregator.h | 14 +++- bolt/include/bolt/Profile/DataReader.h | 15 +---- .../include/bolt/Profile/ProfileYAMLMapping.h | 2 + bolt/lib/Core/BinaryFunction.cpp | 2 + bolt/lib/Passes/ProfileQualityStats.cpp | 3 + bolt/lib/Profile/BoltAddressTranslation.cpp | 4 +- bolt/lib/Profile/DataAggregator.cpp | 66 +++---------------- bolt/lib/Profile/DataReader.cpp | 6 ++ bolt/lib/Profile/YAMLProfileReader.cpp | 1 + bolt/lib/Profile/YAMLProfileWriter.cpp | 1 + bolt/test/X86/shrinkwrapping.test | 2 + 12 files changed, 56 insertions(+), 72 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 14957cba50174..ca8b786f4ab69 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -388,6 +388,10 @@ class BinaryFunction { /// The profile data for the number of times the function was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; + /// Profile data for the number of times this function was entered from + /// external code (DSO, JIT, etc). + uint64_t ExternEntryCount{0}; + /// Profile match ratio. float ProfileMatchRatio{0.0f}; @@ -1877,6 +1881,10 @@ class BinaryFunction { return *this; } + /// Set the profile data for the number of times the function was entered from + /// external code (DSO/JIT). + void setExternEntryCount(uint64_t Count) { ExternEntryCount = Count; } + /// Adjust execution count for the function by a given \p Count. The value /// \p Count will be subtracted from the current function count. /// @@ -1904,6 +1912,10 @@ class BinaryFunction { /// Return COUNT_NO_PROFILE if there's no profile info. uint64_t getExecutionCount() const { return ExecutionCount; } + /// Return the profile information about the number of times the function was + /// entered from external code (DSO/JIT). + uint64_t getExternEntryCount() const { return ExternEntryCount; } + /// Return the raw profile information about the number of branch /// executions corresponding to this function. uint64_t getRawSampleCount() const { return RawSampleCount; } diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index cb8e81b829a09..3f07a6dc03a4f 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -78,6 +78,13 @@ class DataAggregator : public DataReader { static bool checkPerfDataMagic(StringRef FileName); private: + struct LBREntry { + uint64_t From; + uint64_t To; + bool Mispred; + }; + friend raw_ostream &operator<<(raw_ostream &OS, const LBREntry &); + struct PerfBranchSample { SmallVector LBR; }; @@ -476,7 +483,6 @@ class DataAggregator : public DataReader { /// Debugging dump methods void dump() const; - void dump(const LBREntry &LBR) const; void dump(const PerfBranchSample &Sample) const; void dump(const PerfMemSample &Sample) const; @@ -504,6 +510,12 @@ class DataAggregator : public DataReader { friend class YAMLProfileWriter; }; + +inline raw_ostream &operator<<(raw_ostream &OS, + const DataAggregator::LBREntry &L) { + OS << formatv("{0:x} -> {1:x}/{2}", L.From, L.To, L.Mispred ? 'M' : 'P'); + return OS; +} } // namespace bolt } // namespace llvm diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h index 5df1b5a8f4a00..6f527ba3931d4 100644 --- a/bolt/include/bolt/Profile/DataReader.h +++ b/bolt/include/bolt/Profile/DataReader.h @@ -32,18 +32,6 @@ namespace bolt { class BinaryFunction; -struct LBREntry { - uint64_t From; - uint64_t To; - bool Mispred; -}; - -inline raw_ostream &operator<<(raw_ostream &OS, const LBREntry &LBR) { - OS << "0x" << Twine::utohexstr(LBR.From) << " -> 0x" - << Twine::utohexstr(LBR.To); - return OS; -} - struct Location { bool IsSymbol; StringRef Name; @@ -109,6 +97,9 @@ struct FuncBranchData { /// Total execution count for the function. int64_t ExecutionCount{0}; + /// Total entry count from external code for the function. + uint64_t ExternEntryCount{0}; + /// Indicate if the data was used. bool Used{false}; diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h index a8d9a15311d94..41e2bd1651efd 100644 --- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h +++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h @@ -206,6 +206,7 @@ struct BinaryFunctionProfile { uint32_t Id{0}; llvm::yaml::Hex64 Hash{0}; uint64_t ExecCount{0}; + uint64_t ExternEntryCount{0}; std::vector Blocks; std::vector InlineTree; bool Used{false}; @@ -218,6 +219,7 @@ template <> struct MappingTraits { YamlIO.mapRequired("fid", BFP.Id); YamlIO.mapRequired("hash", BFP.Hash); YamlIO.mapRequired("exec", BFP.ExecCount); + YamlIO.mapOptional("extern", BFP.ExternEntryCount, 0); YamlIO.mapRequired("nblocks", BFP.NumBasicBlocks); YamlIO.mapOptional("blocks", BFP.Blocks, std::vector()); diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 6d1969f5c6c30..b998d7160aae7 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -471,6 +471,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) { OS << "\n Sample Count: " << RawSampleCount; OS << "\n Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f); } + if (ExternEntryCount) + OS << "\n Extern Entry Count: " << ExternEntryCount; if (opts::PrintDynoStats && !getLayout().block_empty()) { OS << '\n'; diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp index dfd74d3dd5719..64cc662c3ab29 100644 --- a/bolt/lib/Passes/ProfileQualityStats.cpp +++ b/bolt/lib/Passes/ProfileQualityStats.cpp @@ -532,6 +532,9 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { std::vector &MaxCountMap = TotalMaxCountMaps[FunctionNum]; std::vector &MinCountMap = TotalMinCountMaps[FunctionNum]; + // Record external entry count into CallGraphIncomingFlows + CallGraphIncomingFlows[FunctionNum] += Function->getExternEntryCount(); + // Update MaxCountMap, MinCountMap, and CallGraphIncomingFlows auto recordCall = [&](const BinaryBasicBlock *SourceBB, const MCSymbol *DestSymbol, uint64_t Count, diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp index a253522e4fb15..7ad4e6a2e1411 100644 --- a/bolt/lib/Profile/BoltAddressTranslation.cpp +++ b/bolt/lib/Profile/BoltAddressTranslation.cpp @@ -546,7 +546,7 @@ BoltAddressTranslation::getFallthroughsInTrace(uint64_t FuncAddress, return Res; for (auto Iter = FromIter; Iter != ToIter;) { - const uint32_t Src = Iter->first; + const uint32_t Src = Iter->second >> 1; if (Iter->second & BRANCHENTRY) { ++Iter; continue; @@ -557,7 +557,7 @@ BoltAddressTranslation::getFallthroughsInTrace(uint64_t FuncAddress, ++Iter; if (Iter->second & BRANCHENTRY) break; - Res.emplace_back(Src, Iter->first); + Res.emplace_back(Src, Iter->second >> 1); } return Res; diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 2527b5bfe38d2..addff196f4f5b 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -827,13 +827,8 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, << FromFunc->getPrintName() << ":" << Twine::utohexstr(First.To) << " to " << Twine::utohexstr(Second.From) << ".\n"); - for (auto [From, To] : *FTs) { - if (BAT) { - From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true); - To = BAT->translate(FromFunc->getAddress(), To, /*IsBranchSrc=*/false); - } + for (auto [From, To] : *FTs) doIntraBranch(*ParentFunc, From, To, Count, false); - } return true; } @@ -972,7 +967,7 @@ bool DataAggregator::recordExit(BinaryFunction &BF, uint64_t From, bool Mispred, return true; } -ErrorOr DataAggregator::parseLBREntry() { +ErrorOr DataAggregator::parseLBREntry() { LBREntry Res; ErrorOr FromStrRes = parseString('/'); if (std::error_code EC = FromStrRes.getError()) @@ -1430,54 +1425,16 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample, const uint64_t TraceTo = NextLBR->From; const BinaryFunction *TraceBF = getBinaryFunctionContainingAddress(TraceFrom); - if (opts::HeatmapMode == opts::HeatmapModeKind::HM_Exclusive) { - FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)]; + FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)]; + if (TraceBF && TraceBF->containsAddress(LBR.From)) ++Info.InternCount; - } else if (TraceBF && TraceBF->containsAddress(TraceTo)) { - FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)]; - if (TraceBF->containsAddress(LBR.From)) - ++Info.InternCount; - else - ++Info.ExternCount; - } else { - const BinaryFunction *ToFunc = - getBinaryFunctionContainingAddress(TraceTo); - if (TraceBF && ToFunc) { - LLVM_DEBUG({ - dbgs() << "Invalid trace starting in " << TraceBF->getPrintName() - << formatv(" @ {0:x}", TraceFrom - TraceBF->getAddress()) - << formatv(" and ending @ {0:x}\n", TraceTo); - }); - ++NumInvalidTraces; - } else { - LLVM_DEBUG({ - dbgs() << "Out of range trace starting in " - << (TraceBF ? TraceBF->getPrintName() : "None") - << formatv(" @ {0:x}", - TraceFrom - (TraceBF ? TraceBF->getAddress() : 0)) - << " and ending in " - << (ToFunc ? ToFunc->getPrintName() : "None") - << formatv(" @ {0:x}\n", - TraceTo - (ToFunc ? ToFunc->getAddress() : 0)); - }); - ++NumLongRangeTraces; - } - } + else + ++Info.ExternCount; ++NumTraces; } NextLBR = &LBR; - // Record branches outside binary functions for heatmap. - if (opts::HeatmapMode == opts::HeatmapModeKind::HM_Exclusive) { - TakenBranchInfo &Info = BranchLBRs[Trace(LBR.From, LBR.To)]; - ++Info.TakenCount; - continue; - } - uint64_t From = getBinaryFunctionContainingAddress(LBR.From) ? LBR.From : 0; - uint64_t To = getBinaryFunctionContainingAddress(LBR.To) ? LBR.To : 0; - if (!From && !To) - continue; - TakenBranchInfo &Info = BranchLBRs[Trace(From, To)]; + TakenBranchInfo &Info = BranchLBRs[Trace(LBR.From, LBR.To)]; ++Info.TakenCount; Info.MispredCount += LBR.Mispred; } @@ -2289,6 +2246,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, YamlBF.Id = BF->getFunctionNumber(); YamlBF.Hash = BAT->getBFHash(FuncAddress); YamlBF.ExecCount = BF->getKnownExecutionCount(); + YamlBF.ExternEntryCount = BF->getExternEntryCount(); YamlBF.NumBasicBlocks = BAT->getNumBasicBlocks(FuncAddress); const BoltAddressTranslation::BBHashMapTy &BlockMap = BAT->getBBHashMap(FuncAddress); @@ -2398,16 +2356,10 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, void DataAggregator::dump() const { DataReader::dump(); } -void DataAggregator::dump(const LBREntry &LBR) const { - Diag << "From: " << Twine::utohexstr(LBR.From) - << " To: " << Twine::utohexstr(LBR.To) << " Mispred? " << LBR.Mispred - << "\n"; -} - void DataAggregator::dump(const PerfBranchSample &Sample) const { Diag << "Sample LBR entries: " << Sample.LBR.size() << "\n"; for (const LBREntry &LBR : Sample.LBR) - dump(LBR); + Diag << LBR << '\n'; } void DataAggregator::dump(const PerfMemSample &Sample) const { diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp index c512394f26a3b..afe24216d7f5d 100644 --- a/bolt/lib/Profile/DataReader.cpp +++ b/bolt/lib/Profile/DataReader.cpp @@ -85,6 +85,7 @@ void FuncBranchData::appendFrom(const FuncBranchData &FBD, uint64_t Offset) { } llvm::stable_sort(Data); ExecutionCount += FBD.ExecutionCount; + ExternEntryCount += FBD.ExternEntryCount; for (auto I = FBD.EntryData.begin(), E = FBD.EntryData.end(); I != E; ++I) { assert(I->To.Name == FBD.Name); auto NewElmt = EntryData.insert(EntryData.end(), *I); @@ -269,6 +270,7 @@ Error DataReader::preprocessProfile(BinaryContext &BC) { if (FuncBranchData *FuncData = getBranchDataForNames(Function.getNames())) { setBranchData(Function, FuncData); Function.ExecutionCount = FuncData->ExecutionCount; + Function.ExternEntryCount = FuncData->ExternEntryCount; FuncData->Used = true; } } @@ -419,6 +421,7 @@ void DataReader::matchProfileData(BinaryFunction &BF) { if (fetchProfileForOtherEntryPoints(BF)) { BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD); BF.ExecutionCount = FBD->ExecutionCount; + BF.ExternEntryCount = FBD->ExternEntryCount; BF.RawSampleCount = FBD->getNumExecutedBranches(); } return; @@ -449,6 +452,7 @@ void DataReader::matchProfileData(BinaryFunction &BF) { setBranchData(BF, NewBranchData); NewBranchData->Used = true; BF.ExecutionCount = NewBranchData->ExecutionCount; + BF.ExternEntryCount = NewBranchData->ExternEntryCount; BF.ProfileMatchRatio = 1.0f; break; } @@ -1190,6 +1194,8 @@ std::error_code DataReader::parse() { if (BI.To.IsSymbol && BI.To.Offset == 0) { I = GetOrCreateFuncEntry(BI.To.Name); I->second.ExecutionCount += BI.Branches; + if (!BI.From.IsSymbol) + I->second.ExternEntryCount += BI.Branches; } } diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 33ce40ac2eeec..086e47b661e10 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -176,6 +176,7 @@ bool YAMLProfileReader::parseFunctionProfile( uint64_t FunctionExecutionCount = 0; BF.setExecutionCount(YamlBF.ExecCount); + BF.setExternEntryCount(YamlBF.ExternEntryCount); uint64_t FuncRawBranchCount = 0; for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp index f1fe45f21a0f6..f4308d6fc1992 100644 --- a/bolt/lib/Profile/YAMLProfileWriter.cpp +++ b/bolt/lib/Profile/YAMLProfileWriter.cpp @@ -226,6 +226,7 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS, YamlBF.Hash = BF.getHash(); YamlBF.NumBasicBlocks = BF.size(); YamlBF.ExecCount = BF.getKnownExecutionCount(); + YamlBF.ExternEntryCount = BF.getExternEntryCount(); DenseMap InlineTreeNodeId; if (PseudoProbeDecoder && BF.getGUID()) { std::tie(YamlBF.InlineTree, InlineTreeNodeId) = diff --git a/bolt/test/X86/shrinkwrapping.test b/bolt/test/X86/shrinkwrapping.test index 8581d7e0c0f7b..521b4561b3ba6 100644 --- a/bolt/test/X86/shrinkwrapping.test +++ b/bolt/test/X86/shrinkwrapping.test @@ -8,6 +8,7 @@ REQUIRES: shell RUN: %clangxx %cxxflags -no-pie %S/Inputs/exc4sw.S -o %t.exe -Wl,-q RUN: llvm-bolt %t.exe -o %t --relocs --frame-opt=all \ +RUN: --print-only=main --print-cfg \ RUN: --data=%p/Inputs/exc4sw.fdata --reorder-blocks=cache 2>&1 | \ RUN: FileCheck %s --check-prefix=CHECK-BOLT @@ -19,6 +20,7 @@ RUN: llvm-objdump --dwarf=frames %t | grep -A20 -e \ RUN: `llvm-nm --numeric-sort %t | grep main | tail -n 1 | cut -f1 -d' ' | \ RUN: tail -c9` 2>&1 | FileCheck %s --check-prefix=CHECK-OUTPUT +CHECK-BOLT: Extern Entry Count: 100 CHECK-BOLT: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops CHECK-INPUT: DW_CFA_advance_loc: 2 From 81f92265327a8cf6e730ad1d01fd02f4ef76ed86 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Sat, 7 Jun 2025 21:11:24 -0700 Subject: [PATCH 2/5] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20ch?= =?UTF-8?q?anges=20introduced=20through=20rebase?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.4 [skip ci] --- bolt/lib/Profile/DataAggregator.cpp | 4 +++- bolt/test/X86/pre-aggregated-perf.test | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index addff196f4f5b..0e6abdb2052af 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -733,8 +733,10 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, // corresponds to a return (if \p IsFrom) or a call continuation (otherwise). auto handleAddress = [&](uint64_t &Addr, bool IsFrom) { BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr); - if (!Func) + if (!Func) { + Addr = 0; return std::pair{Func, false}; + } Addr -= Func->getAddress(); diff --git a/bolt/test/X86/pre-aggregated-perf.test b/bolt/test/X86/pre-aggregated-perf.test index 92e093c238e00..cc79cbd339505 100644 --- a/bolt/test/X86/pre-aggregated-perf.test +++ b/bolt/test/X86/pre-aggregated-perf.test @@ -67,10 +67,10 @@ BASIC-ERROR: BOLT-INFO: 0 out of 7 functions in the binary (0.0%) have non-empty BASIC-SUCCESS: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile CHECK-BASIC-NL: no_lbr cycles -PERF2BOLT: 0 [unknown] 7f36d18d60c0 1 main 53c 0 2 +PERF2BOLT: 0 [unknown] 0 1 main 53c 0 2 PERF2BOLT: 1 main 451 1 SolveCubic 0 0 2 -PERF2BOLT: 1 main 490 0 [unknown] 4005f0 0 1 -PERF2BOLT: 1 main 537 0 [unknown] 400610 0 1 +PERF2BOLT: 1 main 490 0 [unknown] 0 0 1 +PERF2BOLT: 1 main 537 0 [unknown] 0 0 1 PERF2BOLT: 1 usqrt 30 1 usqrt 32 0 22 PERF2BOLT: 1 usqrt 30 1 usqrt 39 4 33 PERF2BOLT: 1 usqrt 35 1 usqrt 39 0 22 From 867bac6dfba4b68746775aa1ebfca0667b5ee7c7 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 9 Jun 2025 15:57:14 -0700 Subject: [PATCH 3/5] cleanup Created using spr 1.3.4 --- bolt/include/bolt/Profile/DataAggregator.h | 19 +++++++++---------- bolt/lib/Profile/DataAggregator.cpp | 14 +++++++++----- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index 1e115b0231055..10d96fbeca3e2 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -101,27 +101,26 @@ class DataAggregator : public DataReader { /// Container for the unit of branch data. /// Backwards compatible with legacy use for branches and fall-throughs: - /// - if \p Branch is FT_ONLY or FT_EXTERNAL_ORIGIN, the trace only contains - /// fall-through data, - /// - if \p To is EXTERNAL, the trace only contains branch data. + /// - if \p Branch is FT_ONLY or FT_EXTERNAL_ORIGIN, the trace only + /// contains fall-through data, + /// - if \p To is BR_ONLY, the trace only contains branch data. struct Trace { static constexpr const uint64_t EXTERNAL = 0ULL; + static constexpr const uint64_t BR_ONLY = -1ULL; static constexpr const uint64_t FT_ONLY = -1ULL; static constexpr const uint64_t FT_EXTERNAL_ORIGIN = -2ULL; uint64_t Branch; uint64_t From; uint64_t To; - bool operator==(const Trace &Other) const { - return Branch == Other.Branch && From == Other.From && To == Other.To; - } + auto tie() const { return std::tie(Branch, From, To); } + bool operator==(const Trace &Other) const { return tie() == Other.tie(); } + bool operator<(const Trace &Other) const { return tie() < Other.tie(); } }; friend raw_ostream &operator<<(raw_ostream &OS, const Trace &); struct TraceHash { - size_t operator()(const Trace &L) const { - return llvm::hash_combine(L.Branch, L.From, L.To); - } + size_t operator()(const Trace &L) const { return hash_combine(L.tie()); } }; struct TakenBranchInfo { @@ -531,7 +530,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, OS << Twine::utohexstr(T.Branch) << " -> "; } OS << Twine::utohexstr(T.From); - if (T.To) + if (T.To != DataAggregator::Trace::BR_ONLY) OS << " ... " << Twine::utohexstr(T.To); return OS; } diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index bd7e550569140..8e92c7ba1668e 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -514,6 +514,10 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) { deleteTempFiles(); heatmap: + // Sort parsed traces for faster processing. + if (!opts::BasicAggregation) + llvm::sort(Traces, llvm::less_first()); + if (!opts::HeatmapMode) return Error::success(); @@ -1283,7 +1287,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { } if (Type == BRANCH) { - Addr[2] = Location(Trace::EXTERNAL); + Addr[2] = Location(Trace::BR_ONLY); } Trace T{Addr[0]->Offset, Addr[1]->Offset, Addr[2]->Offset}; @@ -1291,7 +1295,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { Traces.emplace_back(T, TI); - if (Addr[2]->Offset) + if (Addr[2]->Offset != Trace::BR_ONLY) NumTraces += Count; NumTotalSamples += Count; @@ -1305,7 +1309,7 @@ bool DataAggregator::ignoreKernelInterrupt(LBREntry &LBR) const { std::error_code DataAggregator::printLBRHeatMap() { outs() << "PERF2BOLT: parse branch events...\n"; - NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName, + NamedRegionTimer T("buildHeatmap", "Building heatmap", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); if (BC->IsLinuxKernel) { @@ -1342,7 +1346,7 @@ std::error_code DataAggregator::printLBRHeatMap() { for (const auto &[PC, Hits] : BasicSamples) HM.registerAddress(PC, Hits); for (const auto &[Trace, Info] : Traces) - if (Trace.To) + if (Trace.To != Trace::BR_ONLY) HM.registerAddressRange(Trace.From, Trace.To, Info.TakenCount); if (HM.getNumInvalidRanges()) @@ -1540,7 +1544,7 @@ void DataAggregator::processBranchEvents() { TimerGroupName, TimerGroupDesc, opts::TimeAggregator); for (const auto &[Trace, Info] : Traces) { - if (Trace.To) + if (Trace.To != Trace::BR_ONLY) doTrace(Trace, Info.TakenCount); if (Trace.Branch != Trace::FT_ONLY && Trace.Branch != Trace::FT_EXTERNAL_ORIGIN) From 137df98ba094b648b4e185a10cae8b7c0b5581c7 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 9 Jun 2025 17:07:43 -0700 Subject: [PATCH 4/5] drop use of external Created using spr 1.3.4 --- bolt/lib/Profile/DataAggregator.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 8e92c7ba1668e..5f384e7f60f25 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -1286,17 +1286,15 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { Addr[0] = Location(Type == FT ? Trace::FT_ONLY : Trace::FT_EXTERNAL_ORIGIN); } - if (Type == BRANCH) { + if (Type == BRANCH) Addr[2] = Location(Trace::BR_ONLY); - } + else + NumTraces += Count; Trace T{Addr[0]->Offset, Addr[1]->Offset, Addr[2]->Offset}; TakenBranchInfo TI{(uint64_t)Count, (uint64_t)Mispreds}; - Traces.emplace_back(T, TI); - if (Addr[2]->Offset != Trace::BR_ONLY) - NumTraces += Count; NumTotalSamples += Count; return std::error_code(); @@ -1393,7 +1391,7 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample, // chronological order) if (NeedsSkylakeFix && NumEntry <= 2) continue; - uint64_t TraceTo = Trace::EXTERNAL; + uint64_t TraceTo = Trace::BR_ONLY; if (NextLBR) { TraceTo = NextLBR->From; ++NumTraces; From 5d99da3ce969a1f1ccf0922cbc978ff5a725ee66 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 9 Jun 2025 17:12:34 -0700 Subject: [PATCH 5/5] drop accidental change Created using spr 1.3.4 --- bolt/lib/Profile/DataAggregator.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 5f384e7f60f25..949014e8cc1e2 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -1286,15 +1286,18 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { Addr[0] = Location(Type == FT ? Trace::FT_ONLY : Trace::FT_EXTERNAL_ORIGIN); } - if (Type == BRANCH) + if (Type == BRANCH) { Addr[2] = Location(Trace::BR_ONLY); - else - NumTraces += Count; + } Trace T{Addr[0]->Offset, Addr[1]->Offset, Addr[2]->Offset}; TakenBranchInfo TI{(uint64_t)Count, (uint64_t)Mispreds}; + Traces.emplace_back(T, TI); + if (Addr[2]->Offset != Trace::BR_ONLY) + NumTraces += Count; + NumTotalSamples += Count; return std::error_code();