From 9c5b3401612f61cdb473a63ad66fb6fe4d67df71 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko
Date: Fri, 13 Sep 2024 21:17:02 -0700
Subject: [PATCH 1/5] [BOLT][AArch64] Add support for compact code model

Add `--compact-code-model` option that executes alternative branch
relaxation with an assumption that the resulting binary has less than
128MB of code. The relaxation is done in `relaxLocalBranches()`, which
operates on a function level and executes on multiple functions in
parallel.

Running the new pass on AArch64 Clang binary produces slightly smaller
code and finishes in about 1/10th of the time.

Note that the new .text has to be smaller than 128MB, *and* .plt has to
be closer than 128MB to the new code.
---
 bolt/include/bolt/Core/BinaryBasicBlock.h |   3 +
 bolt/include/bolt/Core/FunctionLayout.h   |   3 +-
 bolt/include/bolt/Passes/LongJmp.h        |  13 +
 bolt/lib/Core/FunctionLayout.cpp          |   4 +-
 bolt/lib/Passes/LongJmp.cpp               | 278 +++++++++++++++++++++-
 bolt/test/AArch64/compact-code-model.s    |  48 ++++
 6 files changed, 344 insertions(+), 5 deletions(-)
 create mode 100644 bolt/test/AArch64/compact-code-model.s

diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h
index b4f31cf2bae6f..25cccc4edecf6 100644
--- a/bolt/include/bolt/Core/BinaryBasicBlock.h
+++ b/bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -819,6 +819,9 @@ class BinaryBasicBlock {
     return OutputAddressRange;
   }
 
+  uint64_t getOutputStartAddress() const { return OutputAddressRange.first; }
+  uint64_t getOutputEndAddress() const { return OutputAddressRange.second; }
+
   bool hasLocSyms() const { return LocSyms != nullptr; }
 
   /// Return mapping of input offsets to symbols in the output.
diff --git a/bolt/include/bolt/Core/FunctionLayout.h b/bolt/include/bolt/Core/FunctionLayout.h
index 6a13cbec69fee..ee4dd689b8dd6 100644
--- a/bolt/include/bolt/Core/FunctionLayout.h
+++ b/bolt/include/bolt/Core/FunctionLayout.h
@@ -123,7 +123,8 @@ class FunctionFragment {
   const_iterator begin() const;
   iterator end();
   const_iterator end() const;
-  const BinaryBasicBlock *front() const;
+  BinaryBasicBlock *front() const;
+  BinaryBasicBlock *back() const;
 
   friend class FunctionLayout;
 };
diff --git a/bolt/include/bolt/Passes/LongJmp.h b/bolt/include/bolt/Passes/LongJmp.h
index 3d02d75ac4a27..df3ea9620918a 100644
--- a/bolt/include/bolt/Passes/LongJmp.h
+++ b/bolt/include/bolt/Passes/LongJmp.h
@@ -63,6 +63,19 @@ class LongJmpPass : public BinaryFunctionPass {
   uint32_t NumColdStubs{0};
   uint32_t NumSharedStubs{0};
 
+  /// The shortest distance for any branch instruction on AArch64.
+  static constexpr size_t ShortestJumpBits = 16;
+  static constexpr size_t ShortestJumpSpan = 1ULL << (ShortestJumpBits - 1);
+
+  /// The longest single-instruction branch.
+  static constexpr size_t LongestJumpBits = 28;
+  static constexpr size_t LongestJumpSpan = 1ULL << (LongestJumpBits - 1);
+
+  /// Relax all internal function branches including those between fragments.
+  /// Assume that fragments are placed in different sections but are within
+  /// 128MB of each other.
+  void relaxLocalBranches(BinaryFunction &BF);
+
   /// -- Layout estimation methods --
   /// Try to do layout before running the emitter, by looking at BinaryFunctions
   /// and MCInsts -- this is an estimation. To be correct for longjmp inserter
diff --git a/bolt/lib/Core/FunctionLayout.cpp b/bolt/lib/Core/FunctionLayout.cpp
index 15e6127ad2e9e..4498fc44da954 100644
--- a/bolt/lib/Core/FunctionLayout.cpp
+++ b/bolt/lib/Core/FunctionLayout.cpp
@@ -33,7 +33,9 @@ FunctionFragment::const_iterator FunctionFragment::end() const {
   return const_iterator(Layout->block_begin() + StartIndex + Size);
 }
 
-const BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+
+BinaryBasicBlock *FunctionFragment::back() const { return *std::prev(end()); }
 
 FunctionLayout::FunctionLayout() { addFragment(); }
 
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index c483f70a836ee..4ce2322ab4352 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -11,18 +11,26 @@
 //===----------------------------------------------------------------------===//
 
 #include "bolt/Passes/LongJmp.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "llvm/Support/MathExtras.h"
 
 #define DEBUG_TYPE "longjmp"
 
 using namespace llvm;
 
 namespace opts {
+extern cl::OptionCategory BoltCategory;
 extern cl::OptionCategory BoltOptCategory;
 
 extern llvm::cl::opt<unsigned> AlignText;
 extern cl::opt<unsigned> AlignFunctions;
 extern cl::opt<bool> UseOldText;
 extern cl::opt<bool> HotFunctionsAtEnd;
 
+static cl::opt<bool>
+    CompactCodeModel("compact-code-model",
+                     cl::desc("generate code for binaries <128MB on AArch64"),
+                     cl::init(false), cl::cat(BoltCategory));
+
 static cl::opt<bool> GroupStubs("group-stubs",
                                 cl::desc("share stubs across functions"),
                                 cl::init(true), cl::cat(BoltOptCategory));
@@ -61,10 +69,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) {
     if (Next != E && (*Next)->isCold())
       return *I;
   }
-  llvm_unreachable("No hot-colt split point found");
+  llvm_unreachable("No hot-cold split point found");
 }
 
-static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) {
+static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) {
   return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) &&
          !BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst);
 }
@@ -565,7 +573,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
       if (BC.MIB->isPseudo(Inst))
         continue;
 
-      if (!shouldInsertStub(BC, Inst)) {
+      if (!mayNeedStub(BC, Inst)) {
         DotAddress += InsnSize;
         continue;
       }
@@ -629,7 +637,271 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
   return Error::success();
 }
 
+void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
+  BinaryContext &BC = BF.getBinaryContext();
+  auto &MIB = BC.MIB;
+
+  if (!BF.isSimple())
+    return;
+
+  // Quick path.
+  if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
+    return;
+
+  auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) {
+    const unsigned Bits = MIB->getPCRelEncodingSize(Inst);
+    return isIntN(Bits, Offset);
+  };
+
+  auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress,
+                            const BinaryBasicBlock &BB) {
+    const int64_t Offset = BB.getOutputStartAddress() - InstAddress;
+    return isBranchOffsetInRange(Inst, Offset);
+  };
+
+  // Keep track of *all* function trampolines that are going to be added to the
+  // function layout at the end of relaxation.
+  std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>>
+      FunctionTrampolines;
+
+  // Function fragments are relaxed independently.
+  for (FunctionFragment &FF : BF.getLayout().fragments()) {
+    // Fill out code size estimation for the fragment. Use output BB address
+    // ranges to store offsets from the start of the function.
+    uint64_t CodeSize = 0;
+    for (BinaryBasicBlock *BB : FF) {
+      BB->setOutputStartAddress(CodeSize);
+      CodeSize += BB->estimateSize();
+      BB->setOutputEndAddress(CodeSize);
+    }
+
+    // Dynamically-updated size of the fragment.
+    uint64_t FragmentSize = CodeSize;
+
+    // Size of the trampoline in bytes.
+    constexpr uint64_t TrampolineSize = 4;
+
+    // Trampolines created for the fragment. DestinationBB -> TrampolineBB.
+    // NB: here we store only the first trampoline created for DestinationBB.
+    DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;
+
+    // Create a trampoline code after \p BB or at the end of the fragment if BB
+    // is nullptr.
+    auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
+                                  BinaryBasicBlock *TargetBB, uint64_t Count,
+                                  bool UpdateOffsets = true) {
+      std::unique_ptr<BinaryBasicBlock> TrampolineBB = BF.createBasicBlock();
+      MCInst Inst;
+      {
+        auto L = BC.scopeLock();
+        MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get());
+      }
+      TrampolineBB->addInstruction(Inst);
+      TrampolineBB->addSuccessor(TargetBB, Count);
+      TrampolineBB->setExecutionCount(Count);
+      const uint64_t TrampolineAddress =
+          BB ? BB->getOutputEndAddress() : FragmentSize;
+      TrampolineBB->setOutputStartAddress(TrampolineAddress);
+      TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
+      TrampolineBB->setFragmentNum(FF.getFragmentNum());
+
+      if (UpdateOffsets) {
+        FragmentSize += TrampolineSize;
+        for (BinaryBasicBlock *IBB : FF) {
+          if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+            IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+                                       TrampolineSize);
+            IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+                                     TrampolineSize);
+          }
+        }
+        for (auto &Pair : FunctionTrampolines) {
+          BinaryBasicBlock *IBB = Pair.second.get();
+          if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
+            continue;
+          if (IBB == TrampolineBB.get())
+            continue;
+          if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+            IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+                                       TrampolineSize);
+            IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+                                     TrampolineSize);
+          }
+        }
+      }
+
+      if (!FragmentTrampolines.lookup(TargetBB))
+        FragmentTrampolines[TargetBB] = TrampolineBB.get();
+      FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
+                                       std::move(TrampolineBB));
+
+      return FunctionTrampolines.back().second.get();
+    };
+
+    // Pre-populate trampolines by splitting unconditional branches from the
+    // containing basic block.
+    for (BinaryBasicBlock *BB : FF) {
+      MCInst *Inst = BB->getLastNonPseudoInstr();
+      if (!Inst || !MIB->isUnconditionalBranch(*Inst))
+        continue;
+
+      const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst);
+      BB->eraseInstruction(BB->findInstruction(Inst));
+      BB->setOutputEndAddress(BB->getOutputEndAddress() - TrampolineSize);
+
+      BinaryBasicBlock::BinaryBranchInfo BI;
+      BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI);
+
+      BinaryBasicBlock *TrampolineBB =
+          addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false);
+      BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
+    }
+
+    /// Relax the branch \p Inst. Return true if basic block offsets need an
+    /// update after the trampoline insertion.
+    auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
+                           uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
+      BinaryFunction *BF = BB->getParent();
+
+      // Use branch taken count for optimal relaxation.
+      const uint64_t Count = BB->getBranchInfo(*TargetBB).Count;
+      assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+             "Expected valid branch execution count");
+
+      // Try to reuse an existing trampoline without introducing any new code.
+      BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB);
+      if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) {
+        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+        TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() +
+                                        Count);
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+        return;
+      }
+
+      // For cold branches, check if we can introduce a trampoline at the end
+      // of the fragment that is within the branch reach. Note that such
+      // trampoline may change address later and become unreachable in which
+      // case we will need further relaxation.
+      const int64_t OffsetToEnd = FragmentSize - InstAddress;
+      if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) {
+        TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count);
+        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+
+        return;
+      }
+
+      // Insert a new block after the current one and use it as a trampoline.
+      TrampolineBB = addTrampolineAfter(BB, TargetBB, Count);
+
+      // If the other successor is a fall-through, invert the condition code.
+      const BinaryBasicBlock *const NextBB =
+          BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false);
+      if (BB->getConditionalSuccessor(false) == NextBB) {
+        BB->swapConditionalSuccessors();
+        auto L = BC.scopeLock();
+        MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get());
+      } else {
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+      }
+      BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+    };
+
+    bool MayNeedRelaxation;
+    uint64_t NumIterations = 0;
+    do {
+      MayNeedRelaxation = false;
+      ++NumIterations;
+      for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) {
+        BinaryBasicBlock *BB = *BBI;
+        uint64_t NextInstOffset = BB->getOutputStartAddress();
+        for (MCInst &Inst : *BB) {
+          const size_t InstAddress = NextInstOffset;
+          if (!MIB->isPseudo(Inst))
+            NextInstOffset += 4;
+
+          if (!mayNeedStub(BF.getBinaryContext(), Inst))
+            continue;
+
+          const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst);
+
+          // Span of +/-128MB.
+          if (BitsAvailable == LongestJumpBits)
+            continue;
+
+          const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst);
+          BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol);
+          assert(TargetBB &&
+                 "Basic block target expected for conditional branch.");
+
+          // Check if the relaxation is needed.
+          if (TargetBB->getFragmentNum() == FF.getFragmentNum() &&
+              isBlockInRange(Inst, InstAddress, *TargetBB))
+            continue;
+
+          relaxBranch(BB, Inst, InstAddress, TargetBB);
+
+          MayNeedRelaxation = true;
+        }
+      }
+
+      // We may have added new instructions, but the whole fragment is less than
+      // the minimum branch span.
+      if (FragmentSize < ShortestJumpSpan)
+        MayNeedRelaxation = false;
+
+    } while (MayNeedRelaxation);
+
+    LLVM_DEBUG({
+      if (NumIterations > 2) {
+        dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get()
+               << " of " << BF << " in " << NumIterations << " iterations\n";
+      }
+    });
+  }
+
+  // Add trampoline blocks from all fragments to the layout.
+  DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>>
+      Insertions;
+  for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair :
+       FunctionTrampolines) {
+    if (!Pair.second)
+      continue;
+    Insertions[Pair.first].emplace_back(std::move(Pair.second));
+  }
+
+  for (auto &Pair : Insertions) {
+    BF.insertBasicBlocks(Pair.first, std::move(Pair.second),
+                         /*UpdateLayout*/ true, /*UpdateCFI*/ true,
+                         /*RecomputeLPs*/ false);
+  }
+}
+
 Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
+
+  if (opts::CompactCodeModel) {
+    BC.outs()
+        << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";
+
+    ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+      relaxLocalBranches(BF);
+    };
+
+    ParallelUtilities::PredicateTy SkipPredicate =
+        [&](const BinaryFunction &BF) {
+          return !BC.shouldEmit(BF) || !BF.isSimple();
+        };
+
+    ParallelUtilities::runOnEachFunction(
+        BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
+        SkipPredicate, "RelaxLocalBranches");
+
+    return Error::success();
+  }
+
   BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
   std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
   bool Modified;
diff --git a/bolt/test/AArch64/compact-code-model.s b/bolt/test/AArch64/compact-code-model.s
new file mode 100644
index 0000000000000..c8d8ac9131b45
--- /dev/null
+++ b/bolt/test/AArch64/compact-code-model.s
@@ -0,0 +1,48 @@
+## Check that llvm-bolt successfully relaxes branches for compact (<128MB) code
+## model.
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=randomN \
+# RUN:   --keep-nops --compact-code-model
+# RUN: llvm-objdump -d --disassemble-symbols=_start %t.bolt | FileCheck %s
+# RUN: llvm-nm -n %t.bolt | FileCheck %s --check-prefix=CHECK-NM
+
+## _start will be split and its main fragment will be separated from other
+## fragments by large_function() which is over 1MB.
+
+# CHECK-NM: _start
+# CHECK-NM-NEXT: large_function
+# CHECK-NM-NEXT: _start.cold
+
+  .text
+  .globl _start
+  .type _start, %function
+_start:
+  .cfi_startproc
+  cmp x1, 1
+  b.hi .L1
+# CHECK: b.hi
+# CHECK-NEXT: b
+# CHECK-NEXT: b
+
+  bl large_function
+.L1:
+  ret x30
+  .cfi_endproc
+.size _start, .-_start
+
+
+  .globl large_function
+  .type large_function, %function
+large_function:
+  .cfi_startproc
+  .rept 300000
+  nop
+  .endr
+  ret x30
+  .cfi_endproc
+.size large_function, .-large_function
+
+## Force relocation mode.
+  .reloc 0, R_AARCH64_NONE

From 3cabeab3b4cc31b30f36d49df4eb9f8f7540184a Mon Sep 17 00:00:00 2001
From: Maksim Panchenko
Date: Mon, 21 Oct 2024 12:34:49 -0700
Subject: [PATCH 2/5] fixup! [BOLT][AArch64] Add support for compact code model

---
 bolt/lib/Passes/LongJmp.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index 4ce2322ab4352..279ff63faf11e 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -641,9 +641,6 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
   BinaryContext &BC = BF.getBinaryContext();
   auto &MIB = BC.MIB;
 
-  if (!BF.isSimple())
-    return;
-
   // Quick path.
   if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
     return;
@@ -667,7 +664,7 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
   // Function fragments are relaxed independently.
   for (FunctionFragment &FF : BF.getLayout().fragments()) {
     // Fill out code size estimation for the fragment. Use output BB address
-    // ranges to store offsets from the start of the function.
+    // ranges to store offsets from the start of the function fragment.
     uint64_t CodeSize = 0;
     for (BinaryBasicBlock *BB : FF) {
       BB->setOutputStartAddress(CodeSize);
@@ -757,8 +754,9 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
       BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
     }
 
-    /// Relax the branch \p Inst. Return true if basic block offsets need an
-    /// update after the trampoline insertion.
+    /// Relax the branch \p Inst in basic block \p BB that targets \p TargetBB.
+    /// \p InstAddress contains offset of the branch from the start of the
+    /// containing function fragment.
     auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
                            uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
       BinaryFunction *BF = BB->getParent();

From 4dc0221490c6ce812c85b31f148e1e3ad83171a2 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko
Date: Tue, 22 Oct 2024 15:24:27 -0700
Subject: [PATCH 3/5] fixup! fixup! [BOLT][AArch64] Add support for compact code model

---
 bolt/test/AArch64/compact-code-model.s | 66 +++++++++++++++++++++-----
 1 file changed, 55 insertions(+), 11 deletions(-)

diff --git a/bolt/test/AArch64/compact-code-model.s b/bolt/test/AArch64/compact-code-model.s
index c8d8ac9131b45..0805302a88598 100644
--- a/bolt/test/AArch64/compact-code-model.s
+++ b/bolt/test/AArch64/compact-code-model.s
@@ -2,40 +2,84 @@
 ## model.
 
 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
 # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
-# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=randomN \
+# RUN: llvm-bolt %t.exe -o %t.bolt --data %t.fdata --split-functions \
 # RUN:   --keep-nops --compact-code-model
-# RUN: llvm-objdump -d --disassemble-symbols=_start %t.bolt | FileCheck %s
-# RUN: llvm-nm -n %t.bolt | FileCheck %s --check-prefix=CHECK-NM
+# RUN: llvm-objdump -d \
+# RUN:   --disassemble-symbols=_start,_start.cold.0,foo,foo.cold.0 %t.bolt \
+# RUN:   | FileCheck %s
+# RUN: llvm-nm -nS %t.bolt | FileCheck %s --check-prefix=CHECK-NM
 
-## _start will be split and its main fragment will be separated from other
-## fragments by large_function() which is over 1MB.
+## Fragments of _start and foo will be separated by large_function which is over
+## 1MB in size - larger than all conditional branches can cover requiring branch
+## relaxation.
 
 # CHECK-NM: _start
-# CHECK-NM-NEXT: large_function
-# CHECK-NM-NEXT: _start.cold
+# CHECK-NM: foo
+# CHECK-NM: 0000000000124f84 T large_function
+# CHECK-NM: _start.cold.0
+# CHECK-NM: foo.cold.0
 
   .text
   .globl _start
   .type _start, %function
 _start:
+# CHECK: <_start>:
+# FDATA: 0 [unknown] 0 1 _start 0 0 100
   .cfi_startproc
-  cmp x1, 1
-  b.hi .L1
+  cmp x0, 1
+  b.eq .L0
 # CHECK: b.eq
 # CHECK-NEXT: b
 # CHECK-NEXT: b
 
   bl large_function
-.L1:
+.L0:
   ret x30
   .cfi_endproc
 .size _start, .-_start
 
+## Check that long branch in foo() is reused during relaxation. I.e. we should
+## see just one branch to the cold fragment.
+
+  .globl foo
+  .type foo, %function
+foo:
+# CHECK: <foo>:
+# FDATA: 0 [unknown] 0 1 foo 0 0 100
+  .cfi_startproc
+  cmp x0, 0
+.T0:
+  b.eq .ERROR
+# CHECK: b {{.*}}
+# CHECK-NOT: b {{.*}}
+# FDATA: 1 foo #.T0# 1 foo #.T1# 0 100
+.T1:
+  bl large_function
+  cmp x0, 1
+.T2:
+  b.eq .ERROR
+# FDATA: 1 foo #.T2# 1 foo #.T3# 0 100
+.T3:
+  mov x1, x0
+  mov x0, 0
+  ret x30
+
+# CHECK: <foo.cold.0>:
+# CHECK-NEXT: mov x0, #0x1
+# CHECK-NEXT: ret
+.ERROR:
+  mov x0, 1
+  ret x30
+  .cfi_endproc
+.size foo, .-foo
 
   .globl large_function
   .type large_function, %function
 large_function:
+# FDATA: 0 [unknown] 0 1 large_function 0 0 100
   .cfi_startproc
   .rept 300000
   nop
   .endr
   ret x30

From 6641d9f967073a3e6dd0d0d54392f749414a7629 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko
Date: Wed, 23 Oct 2024 15:25:45 -0700
Subject: [PATCH 4/5] fixup! fixup! fixup! [BOLT][AArch64] Add support for compact code model

---
 bolt/lib/Passes/LongJmp.cpp | 67 ++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 27 deletions(-)

diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index 279ff63faf11e..274b33b54373b 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -683,11 +683,15 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
     DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;
 
     // Create a trampoline code after \p BB or at the end of the fragment if BB
-    // is nullptr.
+    // is nullptr. If /p UpdateOffsets is true, update FragmentSize and offsets
+    // for basic blocks affected by the insertion of the trampoline.
     auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
                                   BinaryBasicBlock *TargetBB, uint64_t Count,
                                   bool UpdateOffsets = true) {
-      std::unique_ptr<BinaryBasicBlock> TrampolineBB = BF.createBasicBlock();
+      FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
+                                       BF.createBasicBlock());
+      BinaryBasicBlock *TrampolineBB = FunctionTrampolines.back().second.get();
+
       MCInst Inst;
       {
         auto L = BC.scopeLock();
@@ -702,37 +706,46 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
       TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
       TrampolineBB->setFragmentNum(FF.getFragmentNum());
 
-      if (UpdateOffsets) {
-        FragmentSize += TrampolineSize;
-        for (BinaryBasicBlock *IBB : FF) {
-          if (IBB->getOutputStartAddress() >= TrampolineAddress) {
-            IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
-                                       TrampolineSize);
-            IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+      if (!FragmentTrampolines.lookup(TargetBB))
+        FragmentTrampolines[TargetBB] = TrampolineBB;
+
+      if (!UpdateOffsets)
+        return TrampolineBB;
+
+      FragmentSize += TrampolineSize;
+
+      // If the trampoline was added at the end of the fragment, offsets of
+      // other fragments should stay intact.
+      if (!BB)
+        return TrampolineBB;
+
+      // Update offsets for blocks after BB.
+      for (BinaryBasicBlock *IBB : FF) {
+        if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+          IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
                                      TrampolineSize);
-          }
+          IBB->setOutputEndAddress(IBB->getOutputEndAddress() + TrampolineSize);
         }
-        for (auto &Pair : FunctionTrampolines) {
-          BinaryBasicBlock *IBB = Pair.second.get();
-          if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
-            continue;
-          if (IBB == TrampolineBB.get())
-            continue;
-          if (IBB->getOutputStartAddress() >= TrampolineAddress) {
-            IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
-                                       TrampolineSize);
-            IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+      }
+
+      // Update offsets for trampolines in this fragment that are placed after
+      // the new trampoline. Note that trampoline blocks are not part of the
+      // function/fragment layout until we add them right before the return
+      // from relaxLocalBranches().
+      for (auto &Pair : FunctionTrampolines) {
+        BinaryBasicBlock *IBB = Pair.second.get();
+        if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
+          continue;
+        if (IBB == TrampolineBB)
+          continue;
+        if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+          IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
                                      TrampolineSize);
-          }
+          IBB->setOutputEndAddress(IBB->getOutputEndAddress() + TrampolineSize);
         }
       }
 
-      if (!FragmentTrampolines.lookup(TargetBB))
-        FragmentTrampolines[TargetBB] = TrampolineBB.get();
-      FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
-                                       std::move(TrampolineBB));
-
-      return FunctionTrampolines.back().second.get();
+      return TrampolineBB;
     };

From 96b858b3b3544ac318595a599ac13f8ff344aac3 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko
Date: Tue, 29 Oct 2024 11:11:59 -0700
Subject: [PATCH 5/5] fixup! fixup! fixup! fixup! [BOLT][AArch64] Add support for compact code model

---
 bolt/lib/Passes/LongJmp.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index 274b33b54373b..f54afc28d0da8 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -683,7 +683,7 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
     DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;
 
     // Create a trampoline code after \p BB or at the end of the fragment if BB
-    // is nullptr. If /p UpdateOffsets is true, update FragmentSize and offsets
+    // is nullptr. If \p UpdateOffsets is true, update FragmentSize and offsets
     // for basic blocks affected by the insertion of the trampoline.
     auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
                                   BinaryBasicBlock *TargetBB, uint64_t Count,