11 | 11 | //===----------------------------------------------------------------------===// |
12 | 12 |
13 | 13 | #include "bolt/Passes/LongJmp.h" |
| 14 | +#include "bolt/Core/ParallelUtilities.h" |
| 15 | +#include "llvm/Support/MathExtras.h" |
14 | 16 |
15 | 17 | #define DEBUG_TYPE "longjmp" |
16 | 18 |
17 | 19 | using namespace llvm; |
18 | 20 |
19 | 21 | namespace opts { |
| 22 | +extern cl::OptionCategory BoltCategory; |
20 | 23 | extern cl::OptionCategory BoltOptCategory; |
21 | 24 | extern llvm::cl::opt<unsigned> AlignText; |
22 | 25 | extern cl::opt<unsigned> AlignFunctions; |
23 | 26 | extern cl::opt<bool> UseOldText; |
24 | 27 | extern cl::opt<bool> HotFunctionsAtEnd; |
25 | 28 |
| 29 | +static cl::opt<bool> |
| 30 | + CompactCodeModel("compact-code-model", |
| 31 | + cl::desc("generate code for binaries <128MB on AArch64"), |
| 32 | + cl::init(false), cl::cat(BoltCategory)); |
| 33 | + |
26 | 34 | static cl::opt<bool> GroupStubs("group-stubs", |
27 | 35 | cl::desc("share stubs across functions"), |
28 | 36 | cl::init(true), cl::cat(BoltOptCategory)); |
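The new option gates the relaxation-only path added further down in `runOnFunctions`. Assuming the standard `llvm-bolt` driver, it would be enabled as, for example, `llvm-bolt a.out -o a.bolt --compact-code-model` on an AArch64 binary whose code is known to fit in 128MB; BOLT then relaxes branches function-locally in parallel and skips the general stub-insertion pass entirely.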
@@ -61,10 +69,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) { |
61 | 69 | if (Next != E && (*Next)->isCold()) |
62 | 70 | return *I; |
63 | 71 | } |
64 | | - llvm_unreachable("No hot-colt split point found"); |
| 72 | + llvm_unreachable("No hot-cold split point found"); |
65 | 73 | } |
66 | 74 |
67 | | -static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) { |
| 75 | +static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) { |
68 | 76 | return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) && |
69 | 77 | !BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst); |
70 | 78 | } |
@@ -565,7 +573,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) { |
565 | 573 | if (BC.MIB->isPseudo(Inst)) |
566 | 574 | continue; |
567 | 575 |
568 | | - if (!shouldInsertStub(BC, Inst)) { |
| 576 | + if (!mayNeedStub(BC, Inst)) { |
569 | 577 | DotAddress += InsnSize; |
570 | 578 | continue; |
571 | 579 | } |
@@ -629,7 +637,271 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) { |
629 | 637 | return Error::success(); |
630 | 638 | } |
631 | 639 |
| 640 | +void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) { |
| 641 | + BinaryContext &BC = BF.getBinaryContext(); |
| 642 | + auto &MIB = BC.MIB; |
| 643 | + |
| 644 | + if (!BF.isSimple()) |
| 645 | + return; |
| 646 | + |
| 647 | + // Quick path. |
| 648 | + if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan) |
| 649 | + return; |
| 650 | + |
| 651 | + auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) { |
| 652 | + const unsigned Bits = MIB->getPCRelEncodingSize(Inst); |
| 653 | + return isIntN(Bits, Offset); |
| 654 | + }; |
| 655 | + |
| 656 | + auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress, |
| 657 | + const BinaryBasicBlock &BB) { |
| 658 | + const int64_t Offset = BB.getOutputStartAddress() - InstAddress; |
| 659 | + return isBranchOffsetInRange(Inst, Offset); |
| 660 | + }; |
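These two helpers do all of the range math: `getPCRelEncodingSize` reports the effective number of byte-offset bits a branch can encode, and `isIntN` checks the displacement against it. A minimal sketch of that arithmetic, with bit widths assumed from the AArch64 backend (16 for TBZ/TBNZ, 21 for CBZ/CBNZ/B.cond, 28 for B/BL):

```cpp
#include <cstdint>

// isIntN(Bits, Offset) accepts offsets in [-2^(Bits-1), 2^(Bits-1) - 1],
// so a branch encoding the given width reaches roughly +/- spanBytes(Bits).
constexpr int64_t spanBytes(unsigned Bits) {
  return int64_t(1) << (Bits - 1);
}

static_assert(spanBytes(16) == 32 * 1024, "TBZ/TBNZ: +/-32KB");
static_assert(spanBytes(21) == 1024 * 1024, "CBZ/B.cond: +/-1MB");
static_assert(spanBytes(28) == 128 * 1024 * 1024, "B/BL: +/-128MB");
```

The 28-bit width corresponds to the `LongestJumpBits` (+/-128MB) check below, and the 16-bit width to the `ShortestJumpSpan` early exit above.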
| 661 | + |
| 662 | + // Keep track of *all* function trampolines that are going to be added to the |
| 663 | + // function layout at the end of relaxation. |
| 664 | + std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>> |
| 665 | + FunctionTrampolines; |
| 666 | + |
| 667 | + // Function fragments are relaxed independently. |
| 668 | + for (FunctionFragment &FF : BF.getLayout().fragments()) { |
| 669 | + // Fill out code size estimation for the fragment. Use output BB address |
| 670 | + // ranges to store offsets from the start of the function. |
| 671 | + uint64_t CodeSize = 0; |
| 672 | + for (BinaryBasicBlock *BB : FF) { |
| 673 | + BB->setOutputStartAddress(CodeSize); |
| 674 | + CodeSize += BB->estimateSize(); |
| 675 | + BB->setOutputEndAddress(CodeSize); |
| 676 | + } |
| 677 | + |
| 678 | + // Dynamically-updated size of the fragment. |
| 679 | + uint64_t FragmentSize = CodeSize; |
| 680 | + |
| 681 | + // Size of the trampoline in bytes. |
| 682 | + constexpr uint64_t TrampolineSize = 4; |
| 683 | + |
| 684 | + // Trampolines created for the fragment. DestinationBB -> TrampolineBB. |
| 685 | + // NB: here we store only the first trampoline created for DestinationBB. |
| 686 | + DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines; |
| 687 | + |
| 688 | + // Create trampoline code after \p BB, or at the end of the fragment if |
| 689 | + // BB is nullptr. |
| 690 | + auto addTrampolineAfter = [&](BinaryBasicBlock *BB, |
| 691 | + BinaryBasicBlock *TargetBB, uint64_t Count, |
| 692 | + bool UpdateOffsets = true) { |
| 693 | + std::unique_ptr<BinaryBasicBlock> TrampolineBB = BF.createBasicBlock(); |
| 694 | + MCInst Inst; |
| 695 | + { |
| 696 | + auto L = BC.scopeLock(); |
| 697 | + MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get()); |
| 698 | + } |
| 699 | + TrampolineBB->addInstruction(Inst); |
| 700 | + TrampolineBB->addSuccessor(TargetBB, Count); |
| 701 | + TrampolineBB->setExecutionCount(Count); |
| 702 | + const uint64_t TrampolineAddress = |
| 703 | + BB ? BB->getOutputEndAddress() : FragmentSize; |
| 704 | + TrampolineBB->setOutputStartAddress(TrampolineAddress); |
| 705 | + TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize); |
| 706 | + TrampolineBB->setFragmentNum(FF.getFragmentNum()); |
| 707 | + |
| 708 | + if (UpdateOffsets) { |
| 709 | + FragmentSize += TrampolineSize; |
| 710 | + for (BinaryBasicBlock *IBB : FF) { |
| 711 | + if (IBB->getOutputStartAddress() >= TrampolineAddress) { |
| 712 | + IBB->setOutputStartAddress(IBB->getOutputStartAddress() + |
| 713 | + TrampolineSize); |
| 714 | + IBB->setOutputEndAddress(IBB->getOutputEndAddress() + |
| 715 | + TrampolineSize); |
| 716 | + } |
| 717 | + } |
| 718 | + for (auto &Pair : FunctionTrampolines) { |
| 719 | + BinaryBasicBlock *IBB = Pair.second.get(); |
| 720 | + if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum()) |
| 721 | + continue; |
| 722 | + if (IBB == TrampolineBB.get()) |
| 723 | + continue; |
| 724 | + if (IBB->getOutputStartAddress() >= TrampolineAddress) { |
| 725 | + IBB->setOutputStartAddress(IBB->getOutputStartAddress() + |
| 726 | + TrampolineSize); |
| 727 | + IBB->setOutputEndAddress(IBB->getOutputEndAddress() + |
| 728 | + TrampolineSize); |
| 729 | + } |
| 730 | + } |
| 731 | + } |
| 732 | + |
| 733 | + if (!FragmentTrampolines.lookup(TargetBB)) |
| 734 | + FragmentTrampolines[TargetBB] = TrampolineBB.get(); |
| 735 | + FunctionTrampolines.emplace_back(BB ? BB : FF.back(), |
| 736 | + std::move(TrampolineBB)); |
| 737 | + |
| 738 | + return FunctionTrampolines.back().second.get(); |
| 739 | + }; |
| 740 | + |
| 741 | + // Pre-populate trampolines by splitting unconditional branches from the |
| 742 | + // containing basic block. |
| 743 | + for (BinaryBasicBlock *BB : FF) { |
| 744 | + MCInst *Inst = BB->getLastNonPseudoInstr(); |
| 745 | + if (!Inst || !MIB->isUnconditionalBranch(*Inst)) |
| 746 | + continue; |
| 747 | + |
| 748 | + const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst); |
| 749 | + BB->eraseInstruction(BB->findInstruction(Inst)); |
| 750 | + BB->setOutputEndAddress(BB->getOutputEndAddress() - TrampolineSize); |
| 751 | + |
| 752 | + BinaryBasicBlock::BinaryBranchInfo BI; |
| 753 | + BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI); |
| 754 | + |
| 755 | + BinaryBasicBlock *TrampolineBB = |
| 756 | + addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false); |
| 757 | + BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count); |
| 758 | + } |
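Splitting the terminating unconditional branch into its own single-instruction block leaves the emitted bytes unchanged (the new block sits immediately after its predecessor), but it seeds `FragmentTrampolines` so that later iterations can retarget other out-of-range branches to an existing `b` instead of growing the fragment with a new one.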
| 759 | + |
| 760 | + /// Relax the branch \p Inst in basic block \p BB that targets \p TargetBB. |
| 761 | + /// \p InstAddress is the offset of the branch from the fragment start. |
| 762 | + auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst, |
| 763 | + uint64_t InstAddress, BinaryBasicBlock *TargetBB) { |
| 764 | + BinaryFunction *BF = BB->getParent(); |
| 765 | + |
| 766 | + // Use branch taken count for optimal relaxation. |
| 767 | + const uint64_t Count = BB->getBranchInfo(*TargetBB).Count; |
| 768 | + assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE && |
| 769 | + "Expected valid branch execution count"); |
| 770 | + |
| 771 | + // Try to reuse an existing trampoline without introducing any new code. |
| 772 | + BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB); |
| 773 | + if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) { |
| 774 | + BB->replaceSuccessor(TargetBB, TrampolineBB, Count); |
| 775 | + TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() + |
| 776 | + Count); |
| 777 | + auto L = BC.scopeLock(); |
| 778 | + MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get()); |
| 779 | + return; |
| 780 | + } |
| 781 | + |
| 782 | + // For cold branches, check if we can introduce a trampoline at the end |
| 783 | + // of the fragment that is within the branch reach. Note that such a |
| 784 | + // trampoline may change its address later and become unreachable, in |
| 785 | + // which case further relaxation will be needed. |
| 786 | + const int64_t OffsetToEnd = FragmentSize - InstAddress; |
| 787 | + if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) { |
| 788 | + TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count); |
| 789 | + BB->replaceSuccessor(TargetBB, TrampolineBB, Count); |
| 790 | + auto L = BC.scopeLock(); |
| 791 | + MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get()); |
| 792 | + |
| 793 | + return; |
| 794 | + } |
| 795 | + |
| 796 | + // Insert a new block after the current one and use it as a trampoline. |
| 797 | + TrampolineBB = addTrampolineAfter(BB, TargetBB, Count); |
| 798 | + |
| 799 | + // If the other successor is a fall-through, invert the condition code. |
| 800 | + const BinaryBasicBlock *const NextBB = |
| 801 | + BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false); |
| 802 | + if (BB->getConditionalSuccessor(false) == NextBB) { |
| 803 | + BB->swapConditionalSuccessors(); |
| 804 | + auto L = BC.scopeLock(); |
| 805 | + MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get()); |
| 806 | + } else { |
| 807 | + auto L = BC.scopeLock(); |
| 808 | + MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get()); |
| 809 | + } |
| 810 | + BB->replaceSuccessor(TargetBB, TrampolineBB, Count); |
| 811 | + }; |
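For example (labels hypothetical): when the false-path successor `.Lnext` is the fall-through block, an out-of-range `b.eq .Lfar` becomes `b.ne .Lnext` followed by a trampoline `b .Lfar` placed right after it, so the conditional branch only needs to reach the next instruction while the unconditional `b` provides the full +/-128MB span.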
| 812 | + |
| 813 | + bool MayNeedRelaxation; |
| 814 | + uint64_t NumIterations = 0; |
| 815 | + do { |
| 816 | + MayNeedRelaxation = false; |
| 817 | + ++NumIterations; |
| 818 | + for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) { |
| 819 | + BinaryBasicBlock *BB = *BBI; |
| 820 | + uint64_t NextInstOffset = BB->getOutputStartAddress(); |
| 821 | + for (MCInst &Inst : *BB) { |
| 822 | + const size_t InstAddress = NextInstOffset; |
| 823 | + if (!MIB->isPseudo(Inst)) |
| 824 | + NextInstOffset += 4; |
| 825 | + |
| 826 | + if (!mayNeedStub(BF.getBinaryContext(), Inst)) |
| 827 | + continue; |
| 828 | + |
| 829 | + const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst); |
| 830 | + |
| 831 | + // Span of +/-128MB. |
| 832 | + if (BitsAvailable == LongestJumpBits) |
| 833 | + continue; |
| 834 | + |
| 835 | + const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst); |
| 836 | + BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol); |
| 837 | + assert(TargetBB && |
| 838 | + "Basic block target expected for conditional branch."); |
| 839 | + |
| 840 | + // Check if the relaxation is needed. |
| 841 | + if (TargetBB->getFragmentNum() == FF.getFragmentNum() && |
| 842 | + isBlockInRange(Inst, InstAddress, *TargetBB)) |
| 843 | + continue; |
| 844 | + |
| 845 | + relaxBranch(BB, Inst, InstAddress, TargetBB); |
| 846 | + |
| 847 | + MayNeedRelaxation = true; |
| 848 | + } |
| 849 | + } |
| 850 | + |
| 851 | + // We may have added new instructions, but if the whole fragment still |
| 852 | + // fits within the shortest branch span, every branch is in range. |
| 853 | + if (FragmentSize < ShortestJumpSpan) |
| 854 | + MayNeedRelaxation = false; |
| 855 | + |
| 856 | + } while (MayNeedRelaxation); |
| 857 | + |
| 858 | + LLVM_DEBUG({ |
| 859 | + if (NumIterations > 2) { |
| 860 | + dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get() |
| 861 | + << " of " << BF << " in " << NumIterations << " iterations\n"; |
| 862 | + } |
| 863 | + }); |
| 864 | + } |
| 865 | + |
| 866 | + // Add trampoline blocks from all fragments to the layout. |
| 867 | + DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>> |
| 868 | + Insertions; |
| 869 | + for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair : |
| 870 | + FunctionTrampolines) { |
| 871 | + if (!Pair.second) |
| 872 | + continue; |
| 873 | + Insertions[Pair.first].emplace_back(std::move(Pair.second)); |
| 874 | + } |
| 875 | + |
| 876 | + for (auto &Pair : Insertions) { |
| 877 | + BF.insertBasicBlocks(Pair.first, std::move(Pair.second), |
| 878 | + /*UpdateLayout*/ true, /*UpdateCFI*/ true, |
| 879 | + /*RecomputeLPs*/ false); |
| 880 | + } |
| 881 | +} |
| 882 | + |
632 | 883 | Error LongJmpPass::runOnFunctions(BinaryContext &BC) { |
| 884 | + |
| 885 | + if (opts::CompactCodeModel) { |
| 886 | + BC.outs() |
| 887 | + << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n"; |
| 888 | + |
| 889 | + ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { |
| 890 | + relaxLocalBranches(BF); |
| 891 | + }; |
| 892 | + |
| 893 | + ParallelUtilities::PredicateTy SkipPredicate = |
| 894 | + [&](const BinaryFunction &BF) { |
| 895 | + return !BC.shouldEmit(BF) || !BF.isSimple(); |
| 896 | + }; |
| 897 | + |
| 898 | + ParallelUtilities::runOnEachFunction( |
| 899 | + BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun, |
| 900 | + SkipPredicate, "RelaxLocalBranches"); |
| 901 | + |
| 902 | + return Error::success(); |
| 903 | + } |
| 904 | + |
633 | 905 | BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n"; |
634 | 906 | std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions(); |
635 | 907 | bool Modified; |