diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index 10d96fbeca3e2..96969cf53baca 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -132,6 +132,9 @@ class DataAggregator : public DataReader { /// and use them later for processing and assigning profile. std::unordered_map TraceMap; std::vector> Traces; + /// Pre-populated addresses of returns, coming from pre-aggregated data or + /// disassembly. Used to disambiguate call-continuation fall-throughs. + std::unordered_set Returns; std::unordered_map BasicSamples; std::vector MemSamples; @@ -204,8 +207,8 @@ class DataAggregator : public DataReader { /// Return a vector of offsets corresponding to a trace in a function /// if the trace is valid, std::nullopt otherwise. std::optional, 16>> - getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, - uint64_t Count) const; + getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, uint64_t Count, + bool IsReturn) const; /// Record external entry into the function \p BF. /// @@ -265,11 +268,14 @@ class DataAggregator : public DataReader { uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds); + /// Checks if \p Addr corresponds to a return instruction. + bool checkReturn(uint64_t Addr); + /// Register a \p Branch. bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds); /// Register a trace between two LBR entries supplied in execution order. - bool doTrace(const Trace &Trace, uint64_t Count); + bool doTrace(const Trace &Trace, uint64_t Count, bool IsReturn); /// Parser helpers /// Return false if we exhausted our parser buffer and finished parsing diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 118629b04f6fc..178c9d3a63730 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -730,50 +730,54 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, return true; } +bool DataAggregator::checkReturn(uint64_t Addr) { + auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); }; + if (llvm::is_contained(Returns, Addr)) + return true; + + BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr); + if (!Func) + return false; + + const uint64_t Offset = Addr - Func->getAddress(); + if (Func->hasInstructions() + ? isReturn(Func->getInstructionAtOffset(Offset)) + : isReturn(Func->disassembleInstructionAtOffset(Offset))) { + Returns.emplace(Addr); + return true; + } + return false; +} + bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds) { - // Returns whether \p Offset in \p Func contains a return instruction. - auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) { - auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); }; - return Func.hasInstructions() - ? isReturn(Func.getInstructionAtOffset(Offset)) - : isReturn(Func.disassembleInstructionAtOffset(Offset)); - }; - // Mutates \p Addr to an offset into the containing function, performing BAT // offset translation and parent lookup. // - // Returns the containing function (or BAT parent) and whether the address - // corresponds to a return (if \p IsFrom) or a call continuation (otherwise). + // Returns the containing function (or BAT parent). auto handleAddress = [&](uint64_t &Addr, bool IsFrom) { BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr); if (!Func) { Addr = 0; - return std::pair{Func, false}; + return Func; } Addr -= Func->getAddress(); - bool IsRet = IsFrom && checkReturn(*Func, Addr); - if (BAT) Addr = BAT->translate(Func->getAddress(), Addr, IsFrom); if (BinaryFunction *ParentFunc = getBATParentFunction(*Func)) - Func = ParentFunc; + return ParentFunc; - return std::pair{Func, IsRet}; + return Func; }; - auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom*/ true); - auto [ToFunc, _] = handleAddress(To, /*IsFrom*/ false); + BinaryFunction *FromFunc = handleAddress(From, /*IsFrom*/ true); + BinaryFunction *ToFunc = handleAddress(To, /*IsFrom*/ false); if (!FromFunc && !ToFunc) return false; - // Ignore returns. - if (IsReturn) - return true; - // Treat recursive control transfers as inter-branches. if (FromFunc == ToFunc && To != 0) { recordBranch(*FromFunc, From, To, Count, Mispreds); @@ -783,7 +787,8 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds); } -bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) { +bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count, + bool IsReturn) { const uint64_t From = Trace.From, To = Trace.To; BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From); BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To); @@ -808,8 +813,8 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) { const uint64_t FuncAddress = FromFunc->getAddress(); std::optional FTs = BAT && BAT->isBATFunction(FuncAddress) - ? BAT->getFallthroughsInTrace(FuncAddress, From, To) - : getFallthroughsInTrace(*FromFunc, Trace, Count); + ? BAT->getFallthroughsInTrace(FuncAddress, From - IsReturn, To) + : getFallthroughsInTrace(*FromFunc, Trace, Count, IsReturn); if (!FTs) { LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n'); NumInvalidTraces += Count; @@ -831,7 +836,7 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) { std::optional, 16>> DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, - uint64_t Count) const { + uint64_t Count, bool IsReturn) const { SmallVector, 16> Branches; BinaryContext &BC = BF.getBinaryContext(); @@ -865,9 +870,13 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, // Adjust FromBB if the first LBR is a return from the last instruction in // the previous block (that instruction should be a call). - if (Trace.Branch != Trace::FT_ONLY && !BF.containsAddress(Trace.Branch) && - From == FromBB->getOffset() && !FromBB->isEntryPoint() && - !FromBB->isLandingPad()) { + if (IsReturn) { + if (From) + FromBB = BF.getBasicBlockContainingOffset(From - 1); + else + LLVM_DEBUG(dbgs() << "return to the function start: " << Trace << '\n'); + } else if (Trace.Branch == Trace::EXTERNAL && From == FromBB->getOffset() && + !FromBB->isEntryPoint() && !FromBB->isLandingPad()) { const BinaryBasicBlock *PrevBB = BF.getLayout().getBlock(FromBB->getIndex() - 1); if (PrevBB->getSuccessor(FromBB->getLabel())) { @@ -1557,11 +1566,13 @@ void DataAggregator::processBranchEvents() { TimerGroupName, TimerGroupDesc, opts::TimeAggregator); for (const auto &[Trace, Info] : Traces) { - if (Trace.Branch != Trace::FT_ONLY && + bool IsReturn = checkReturn(Trace.Branch); + // Ignore returns. + if (!IsReturn && Trace.Branch != Trace::FT_ONLY && Trace.Branch != Trace::FT_EXTERNAL_ORIGIN) doBranch(Trace.Branch, Trace.From, Info.TakenCount, Info.MispredCount); if (Trace.To != Trace::BR_ONLY) - doTrace(Trace, Info.TakenCount); + doTrace(Trace, Info.TakenCount, IsReturn); } printBranchSamplesDiagnostics(); } diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index 4994cfb541eef..c2ef024db9475 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -4,29 +4,43 @@ # RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so ## Link against a DSO to ensure PLT entries. # RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib -# RUN: link_fdata %s %t %t.pat PREAGGT1 -# RUN: link_fdata %s %t %t.pat2 PREAGGT2 -# RUN-DISABLED: link_fdata %s %t %t.patplt PREAGGPLT +# Trace to a call continuation, not a landing pad/entry point +# RUN: link_fdata %s %t %t.pa-base PREAGG-BASE +# Trace from a return to a landing pad/entry point call continuation +# RUN: link_fdata %s %t %t.pa-ret PREAGG-RET +# Trace from an external location to a landing pad/entry point call continuation +# RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT +# RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT # RUN: llvm-strip --strip-unneeded %t -o %t.strip # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh ## Check pre-aggregated traces attach call continuation fallthrough count -# RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \ -# RUN: --print-cfg --print-only=main | FileCheck %s - -## Check pre-aggregated traces don't attach call continuation fallthrough count -## to secondary entry point (unstripped) -# RUN: llvm-bolt %t --pa -p %t.pat2 -o %t.out \ -# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3 -## Check pre-aggregated traces don't attach call continuation fallthrough count -## to landing pad (stripped, LP) -# RUN: llvm-bolt %t.strip --pa -p %t.pat2 -o %t.out \ -# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3 +## in the basic case (not an entry point, not a landing pad). +# RUN: llvm-bolt %t.noeh --pa -p %t.pa-base -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-BASE + +## Check pre-aggregated traces from a return attach call continuation +## fallthrough count to secondary entry point (unstripped) +# RUN: llvm-bolt %t --pa -p %t.pa-ret -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH +## Check pre-aggregated traces from a return attach call continuation +## fallthrough count to landing pad (stripped, landing pad) +# RUN: llvm-bolt %t.strip --pa -p %t.pa-ret -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH + +## Check pre-aggregated traces from external location don't attach call +## continuation fallthrough count to secondary entry point (unstripped) +# RUN: llvm-bolt %t --pa -p %t.pa-ext -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP +## Check pre-aggregated traces from external location don't attach call +## continuation fallthrough count to landing pad (stripped, landing pad) +# RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP ## Check pre-aggregated traces don't report zero-sized PLT fall-through as ## invalid trace -# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.patplt -o %t.out | FileCheck %s \ +# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \ # RUN-DISABLED: --check-prefix=CHECK-PLT # CHECK-PLT: traces mismatching disassembled function contents: 0 @@ -56,11 +70,11 @@ main: Ltmp0_br: callq puts@PLT ## Check PLT traces are accepted -# PREAGGPLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3 +# PREAGG-PLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3 ## Target is an external-origin call continuation -# PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2 -# CHECK: callq puts@PLT -# CHECK-NEXT: count: 2 +# PREAGG-BASE: T X:0 #Ltmp1# #Ltmp4_br# 2 +# CHECK-BASE: callq puts@PLT +# CHECK-BASE-NEXT: count: 2 Ltmp1: movq -0x10(%rbp), %rax @@ -71,24 +85,18 @@ Ltmp4: cmpl $0x0, -0x14(%rbp) Ltmp4_br: je Ltmp0 -# CHECK2: je .Ltmp0 -# CHECK2-NEXT: count: 3 movl $0xa, -0x18(%rbp) callq foo ## Target is a binary-local call continuation -# PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1 -# CHECK: callq foo -# CHECK-NEXT: count: 1 - -## PLT call continuation fallthrough spanning the call -# CHECK2: callq foo -# CHECK2-NEXT: count: 3 - +# PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1 ## Target is a secondary entry point (unstripped) or a landing pad (stripped) -# PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2 -# CHECK3: callq foo -# CHECK3-NEXT: count: 0 +# PREAGG-EXT: T X:0 #Ltmp3# #Ltmp3_br# 1 + +# CHECK-ATTACH: callq foo +# CHECK-ATTACH-NEXT: count: 1 +# CHECK-SKIP: callq foo +# CHECK-SKIP-NEXT: count: 0 Ltmp3: cmpl $0x0, -0x18(%rbp)