[BOLT][AArch64] Add support for compact code model #112110
Changes from 4 commits:
9c5b340, 3cabeab, 4dc0221, 6641d9f, 96b858b
@@ -11,18 +11,26 @@
//===----------------------------------------------------------------------===//

#include "bolt/Passes/LongJmp.h"
#include "bolt/Core/ParallelUtilities.h"
#include "llvm/Support/MathExtras.h"

#define DEBUG_TYPE "longjmp"

using namespace llvm;

namespace opts {
extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;
extern llvm::cl::opt<unsigned> AlignText;
extern cl::opt<unsigned> AlignFunctions;
extern cl::opt<bool> UseOldText;
extern cl::opt<bool> HotFunctionsAtEnd;

static cl::opt<bool>
    CompactCodeModel("compact-code-model",
                     cl::desc("generate code for binaries <128MB on AArch64"),
                     cl::init(false), cl::cat(BoltCategory));

static cl::opt<bool> GroupStubs("group-stubs",
                                cl::desc("share stubs across functions"),
                                cl::init(true), cl::cat(BoltOptCategory));
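The "<128MB" in the option description comes from the reach of AArch64's unconditional `B`/`BL`, which encode a 26-bit word offset, i.e. a signed 28-bit byte offset. A minimal sketch of that arithmetic, assuming the span constants this pass reads from `bolt/Passes/LongJmp.h` (the values are mirrored here for illustration, not a definitive copy of the header):

```cpp
#include <cstdint>

// Illustration only: these mirror constants assumed to be declared in
// bolt/Passes/LongJmp.h rather than redefining BOLT's API.
constexpr uint64_t ShortestJumpBits = 16; // TBZ/TBNZ: 14-bit imm, scaled by 4
constexpr uint64_t ShortestJumpSpan = 1ULL << (ShortestJumpBits - 1); // 32 KiB
constexpr uint64_t LongestJumpBits = 28;  // B/BL: 26-bit imm, scaled by 4
constexpr uint64_t LongestJumpSpan = 1ULL << (LongestJumpBits - 1);   // 128 MiB

static_assert(LongestJumpSpan == 128ULL * 1024 * 1024,
              "unconditional branches reach +/-128 MiB");
static_assert(ShortestJumpSpan == 32ULL * 1024,
              "test-and-branch instructions reach only +/-32 KiB");
```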
@@ -61,10 +69,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) {
    if (Next != E && (*Next)->isCold())
      return *I;
  }
-  llvm_unreachable("No hot-colt split point found");
+  llvm_unreachable("No hot-cold split point found");
}

-static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) {
+static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) {
  return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) &&
         !BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst);
}
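The rename makes the intent explicit: a direct branch or call encodes a PC-relative displacement and therefore *may* need a stub if its target ends up out of reach, while indirect branches and calls transfer control through a register and never need one.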
@@ -565,7 +573,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
        if (BC.MIB->isPseudo(Inst))
          continue;

-        if (!shouldInsertStub(BC, Inst)) {
+        if (!mayNeedStub(BC, Inst)) {
          DotAddress += InsnSize;
          continue;
        }
@@ -629,7 +637,282 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
  return Error::success();
}

void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  auto &MIB = BC.MIB;

  // Quick path.
  if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
    return;

  auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) {
    const unsigned Bits = MIB->getPCRelEncodingSize(Inst);
    return isIntN(Bits, Offset);
  };

  auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress,
                            const BinaryBasicBlock &BB) {
    const int64_t Offset = BB.getOutputStartAddress() - InstAddress;
    return isBranchOffsetInRange(Inst, Offset);
  };

  // Keep track of *all* function trampolines that are going to be added to
  // the function layout at the end of relaxation.
  std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>>
      FunctionTrampolines;

  // Function fragments are relaxed independently.
  for (FunctionFragment &FF : BF.getLayout().fragments()) {
    // Fill out code size estimation for the fragment. Use output BB address
    // ranges to store offsets from the start of the function fragment.
    uint64_t CodeSize = 0;
    for (BinaryBasicBlock *BB : FF) {
      BB->setOutputStartAddress(CodeSize);
      CodeSize += BB->estimateSize();
      BB->setOutputEndAddress(CodeSize);
    }

    // Dynamically-updated size of the fragment.
    uint64_t FragmentSize = CodeSize;

    // Size of the trampoline in bytes.
    constexpr uint64_t TrampolineSize = 4;

    // Trampolines created for the fragment. DestinationBB -> TrampolineBB.
    // NB: here we store only the first trampoline created for DestinationBB.
    DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;

    // Create trampoline code after \p BB, or at the end of the fragment if BB
    // is nullptr. If \p UpdateOffsets is true, update FragmentSize and offsets
    // for basic blocks affected by the insertion of the trampoline.
    auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
                                  BinaryBasicBlock *TargetBB, uint64_t Count,
                                  bool UpdateOffsets = true) {
Member:
(not so sure on this, so consider it as me "thinking out loud") Would it make sense to add a check on whether … But could there be a borderline case where …

Contributor (Author):
If … As of now, if the target is out of range, we'll get an error from the emitter when the destination is in the same section, or from JITLink when it is in a different section. In the future, I expect JITLink to handle the latter case with thunks/veneers.

Member:
Thanks for the explanation. Re the same-section case: I stumbled upon such assertions by the emitter (pre compact-mode) and plan to follow up on them.

Contributor (Author):
I'd expect an error from here during the code emission.
      FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
                                       BF.createBasicBlock());
      BinaryBasicBlock *TrampolineBB = FunctionTrampolines.back().second.get();

      MCInst Inst;
      {
        auto L = BC.scopeLock();
        MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get());
      }
      TrampolineBB->addInstruction(Inst);
      TrampolineBB->addSuccessor(TargetBB, Count);
      TrampolineBB->setExecutionCount(Count);
      const uint64_t TrampolineAddress =
          BB ? BB->getOutputEndAddress() : FragmentSize;
      TrampolineBB->setOutputStartAddress(TrampolineAddress);
      TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
      TrampolineBB->setFragmentNum(FF.getFragmentNum());

      if (!FragmentTrampolines.lookup(TargetBB))
        FragmentTrampolines[TargetBB] = TrampolineBB;

      if (!UpdateOffsets)
        return TrampolineBB;

      FragmentSize += TrampolineSize;

      // If the trampoline was added at the end of the fragment, offsets of
      // other fragments should stay intact.
      if (!BB)
        return TrampolineBB;

      // Update offsets for blocks after BB.
      for (BinaryBasicBlock *IBB : FF) {
        if (IBB->getOutputStartAddress() >= TrampolineAddress) {
          IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
                                     TrampolineSize);
          IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
                                   TrampolineSize);
        }
      }

      // Update offsets for trampolines in this fragment that are placed after
      // the new trampoline. Note that trampoline blocks are not part of the
      // function/fragment layout until we add them right before the return
      // from relaxLocalBranches().
      for (auto &Pair : FunctionTrampolines) {
        BinaryBasicBlock *IBB = Pair.second.get();
        if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
          continue;
        if (IBB == TrampolineBB)
          continue;
        if (IBB->getOutputStartAddress() >= TrampolineAddress) {
          IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
                                     TrampolineSize);
          IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
                                   TrampolineSize);
        }
      }

      return TrampolineBB;
    };

    // Pre-populate trampolines by splitting unconditional branches from the
    // containing basic block.
    for (BinaryBasicBlock *BB : FF) {
      MCInst *Inst = BB->getLastNonPseudoInstr();
      if (!Inst || !MIB->isUnconditionalBranch(*Inst))
        continue;

      const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst);
      BB->eraseInstruction(BB->findInstruction(Inst));
      BB->setOutputEndAddress(BB->getOutputEndAddress() - TrampolineSize);

      BinaryBasicBlock::BinaryBranchInfo BI;
      BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI);

      BinaryBasicBlock *TrampolineBB =
          addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false);
      BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
    }

    /// Relax the branch \p Inst in basic block \p BB that targets \p TargetBB.
    /// \p InstAddress contains the offset of the branch from the start of the
    /// containing function fragment.
    auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
                           uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
      BinaryFunction *BF = BB->getParent();

      // Use the branch taken count for optimal relaxation.
      const uint64_t Count = BB->getBranchInfo(*TargetBB).Count;
      assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
             "Expected valid branch execution count");

      // Try to reuse an existing trampoline without introducing any new code.
      BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB);
      if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) {
        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
        TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() +
                                        Count);
        auto L = BC.scopeLock();
        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
        return;
      }

      // For cold branches, check if we can introduce a trampoline at the end
      // of the fragment that is within the branch's reach. Note that such a
      // trampoline may change address later and become unreachable, in which
      // case we will need further relaxation.
      const int64_t OffsetToEnd = FragmentSize - InstAddress;
      if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) {
        TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count);
        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
        auto L = BC.scopeLock();
        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());

        return;
      }

      // Insert a new block after the current one and use it as a trampoline.
      TrampolineBB = addTrampolineAfter(BB, TargetBB, Count);

      // If the other successor is a fall-through, invert the condition code.
      const BinaryBasicBlock *const NextBB =
          BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false);
      if (BB->getConditionalSuccessor(false) == NextBB) {
        BB->swapConditionalSuccessors();
        auto L = BC.scopeLock();
        MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get());
      } else {
        auto L = BC.scopeLock();
        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
      }
      BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
    };

    bool MayNeedRelaxation;
    uint64_t NumIterations = 0;
    do {
      MayNeedRelaxation = false;
      ++NumIterations;
      for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) {
        BinaryBasicBlock *BB = *BBI;
        uint64_t NextInstOffset = BB->getOutputStartAddress();
        for (MCInst &Inst : *BB) {
          const size_t InstAddress = NextInstOffset;
          if (!MIB->isPseudo(Inst))
            NextInstOffset += 4;

          if (!mayNeedStub(BF.getBinaryContext(), Inst))
            continue;

          const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst);

          // Span of +/-128MB.
          if (BitsAvailable == LongestJumpBits)
            continue;

          const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst);
          BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol);
          assert(TargetBB &&
                 "Basic block target expected for conditional branch.");

          // Check if relaxation is needed.
          if (TargetBB->getFragmentNum() == FF.getFragmentNum() &&
              isBlockInRange(Inst, InstAddress, *TargetBB))
            continue;

          relaxBranch(BB, Inst, InstAddress, TargetBB);

          MayNeedRelaxation = true;
        }
      }

      // We may have added new instructions, but the whole fragment is still
      // smaller than the minimum branch span.
      if (FragmentSize < ShortestJumpSpan)
        MayNeedRelaxation = false;

    } while (MayNeedRelaxation);

    LLVM_DEBUG({
      if (NumIterations > 2) {
        dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get()
               << " of " << BF << " in " << NumIterations << " iterations\n";
      }
    });
  }

  // Add trampoline blocks from all fragments to the layout.
  DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>>
      Insertions;
  for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair :
       FunctionTrampolines) {
    if (!Pair.second)
      continue;
    Insertions[Pair.first].emplace_back(std::move(Pair.second));
  }

  for (auto &Pair : Insertions) {
    BF.insertBasicBlocks(Pair.first, std::move(Pair.second),
                         /*UpdateLayout*/ true, /*UpdateCFI*/ true,
                         /*RecomputeLPs*/ false);
  }
}
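The relaxation above works in three tiers: reuse an existing trampoline for the target, append a trampoline at the end of the fragment for cold branches, or insert a trampoline right after the block, inverting the branch condition when the other successor is the fall-through. A toy model of the range check that drives it (not BOLT code; the 21-bit width assumes `getPCRelEncodingSize` reports `b.cond`'s byte-offset field):

```cpp
#include <cstdint>
#include <cstdlib>

// Same predicate as llvm::isIntN(Bits, Offset): does a signed byte offset
// fit in the branch's PC-relative immediate field?
static bool fitsInSignedBits(unsigned Bits, int64_t Offset) {
  return Offset >= -(1LL << (Bits - 1)) && Offset < (1LL << (Bits - 1));
}

int main() {
  // Assumed width: b.cond has a 19-bit immediate scaled by 4, i.e. a
  // 21-bit byte offset, giving a reach of +/-1 MiB.
  const unsigned BccBits = 21;

  // When the offset fits, no relaxation is needed. When it does not,
  // relaxBranch() retargets the conditional branch to a nearby trampoline
  // holding an unconditional `b` (+/-128 MiB reach), inverting the
  // condition when the other successor is the fall-through:
  //   before:  b.eq TargetBB    ; TargetBB is 2 MiB away
  //   after:   b.ne NextBB      ; inverted condition
  //            b    TargetBB    ; trampoline right after the block
  const bool NearFits = fitsInSignedBits(BccBits, 512 * 1024);     // +512 KiB
  const bool FarFits = fitsInSignedBits(BccBits, 2 * 1024 * 1024); // +2 MiB
  return NearFits && !FarFits ? EXIT_SUCCESS : EXIT_FAILURE;
}
```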

Error LongJmpPass::runOnFunctions(BinaryContext &BC) {

  if (opts::CompactCodeModel) {
    BC.outs()
        << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";

    ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
      relaxLocalBranches(BF);
    };

    ParallelUtilities::PredicateTy SkipPredicate =
        [&](const BinaryFunction &BF) {
          return !BC.shouldEmit(BF) || !BF.isSimple();
        };

    ParallelUtilities::runOnEachFunction(
        BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
        SkipPredicate, "RelaxLocalBranches");

    return Error::success();
  }

  BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
  std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
  bool Modified;
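Usage-wise, the new code path is opt-in. A typical invocation (input and output names are placeholders) would be `llvm-bolt app -o app.bolt --compact-code-model`; it is only safe when the optimized binary's code is known to fit within the +/-128 MiB reach of direct branches, which is what lets the pass relax branches locally instead of running the general stub-insertion pass.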