
Commit 9c5b340

[BOLT][AArch64] Add support for compact code model
Add a `--compact-code-model` option that performs alternative branch relaxation under the assumption that the resulting binary has less than 128MB of code. The relaxation is done in `relaxLocalBranches()`, which operates at the function level and runs on multiple functions in parallel. Running the new pass on an AArch64 Clang binary produces slightly smaller code and finishes in about one tenth of the time. Note that the new .text has to be smaller than 128MB, *and* .plt has to be within 128MB of the new code.
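A typical invocation adds the new flag to an existing BOLT command line. The input binary, profile, and companion flags below are illustrative only, not part of this commit:

    # Hypothetical example: post-link-optimize an AArch64 Clang binary whose
    # optimized .text is expected to stay under 128MB.
    llvm-bolt clang -o clang.bolt --data=perf.fdata --reorder-blocks=ext-tsp \
        --split-functions --compact-code-model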
Parent commit: 6d4edf2

File tree: 6 files changed, +344 -5 lines

bolt/include/bolt/Core/BinaryBasicBlock.h

Lines changed: 3 additions & 0 deletions
@@ -819,6 +819,9 @@ class BinaryBasicBlock {
     return OutputAddressRange;
   }
 
+  uint64_t getOutputStartAddress() const { return OutputAddressRange.first; }
+  uint64_t getOutputEndAddress() const { return OutputAddressRange.second; }
+
   bool hasLocSyms() const { return LocSyms != nullptr; }
 
   /// Return mapping of input offsets to symbols in the output.
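The two getters are thin conveniences over the existing `OutputAddressRange` pair; the relaxation pass below leans on them for offset arithmetic. A hypothetical caller (not from this commit), sketched against the BOLT API:

    // Size of a block in the output layout, via the new accessors.
    uint64_t outputSize(const BinaryBasicBlock &BB) {
      return BB.getOutputEndAddress() - BB.getOutputStartAddress();
    }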

bolt/include/bolt/Core/FunctionLayout.h

Lines changed: 2 additions & 1 deletion
@@ -123,7 +123,8 @@ class FunctionFragment {
   const_iterator begin() const;
   iterator end();
   const_iterator end() const;
-  const BinaryBasicBlock *front() const;
+  BinaryBasicBlock *front() const;
+  BinaryBasicBlock *back() const;
 
   friend class FunctionLayout;
 };

bolt/include/bolt/Passes/LongJmp.h

Lines changed: 13 additions & 0 deletions
@@ -63,6 +63,19 @@ class LongJmpPass : public BinaryFunctionPass {
   uint32_t NumColdStubs{0};
   uint32_t NumSharedStubs{0};
 
+  /// The shortest distance for any branch instruction on AArch64.
+  static constexpr size_t ShortestJumpBits = 16;
+  static constexpr size_t ShortestJumpSpan = 1ULL << (ShortestJumpBits - 1);
+
+  /// The longest single-instruction branch.
+  static constexpr size_t LongestJumpBits = 28;
+  static constexpr size_t LongestJumpSpan = 1ULL << (LongestJumpBits - 1);
+
+  /// Relax all internal function branches including those between fragments.
+  /// Assume that fragments are placed in different sections but are within
+  /// 128MB of each other.
+  void relaxLocalBranches(BinaryFunction &BF);
+
   /// -- Layout estimation methods --
   /// Try to do layout before running the emitter, by looking at BinaryFunctions
   /// and MCInsts -- this is an estimation. To be correct for longjmp inserter
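The constants encode the standard PC-relative reach arithmetic: a branch with N bits of signed byte offset reaches +/-2^(N-1) bytes. On AArch64, TBZ/TBNZ carry a 14-bit word offset (16 byte-offset bits, hence +/-32KiB) and B/BL carry a 26-bit word offset (28 bits, hence +/-128MiB); that instruction mapping is our reading of the ISA, not something the patch states. A standalone check of the arithmetic:

    #include <cstddef>

    static constexpr size_t ShortestJumpBits = 16; // TBZ/TBNZ (assumed): 14-bit word offset
    static constexpr size_t LongestJumpBits = 28;  // B/BL (assumed): 26-bit word offset

    static_assert((size_t(1) << (ShortestJumpBits - 1)) == 32 * 1024,
                  "shortest branch spans +/-32KiB");
    static_assert((size_t(1) << (LongestJumpBits - 1)) == 128 * 1024 * 1024,
                  "longest branch spans +/-128MiB");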

bolt/lib/Core/FunctionLayout.cpp

Lines changed: 3 additions & 1 deletion
@@ -33,7 +33,9 @@ FunctionFragment::const_iterator FunctionFragment::end() const {
   return const_iterator(Layout->block_begin() + StartIndex + Size);
 }
 
-const BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
+
+BinaryBasicBlock *FunctionFragment::back() const { return *std::prev(end()); }
 
 FunctionLayout::FunctionLayout() { addFragment(); }

bolt/lib/Passes/LongJmp.cpp

Lines changed: 275 additions & 3 deletions
@@ -11,18 +11,26 @@
 //===----------------------------------------------------------------------===//
 
 #include "bolt/Passes/LongJmp.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "llvm/Support/MathExtras.h"
 
 #define DEBUG_TYPE "longjmp"
 
 using namespace llvm;
 
 namespace opts {
+extern cl::OptionCategory BoltCategory;
 extern cl::OptionCategory BoltOptCategory;
 extern llvm::cl::opt<unsigned> AlignText;
 extern cl::opt<unsigned> AlignFunctions;
 extern cl::opt<bool> UseOldText;
 extern cl::opt<bool> HotFunctionsAtEnd;
 
+static cl::opt<bool>
+    CompactCodeModel("compact-code-model",
+                     cl::desc("generate code for binaries <128MB on AArch64"),
+                     cl::init(false), cl::cat(BoltCategory));
+
 static cl::opt<bool> GroupStubs("group-stubs",
                                 cl::desc("share stubs across functions"),
                                 cl::init(true), cl::cat(BoltOptCategory));
@@ -61,10 +69,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) {
     if (Next != E && (*Next)->isCold())
       return *I;
   }
-  llvm_unreachable("No hot-colt split point found");
+  llvm_unreachable("No hot-cold split point found");
 }
 
-static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) {
+static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) {
   return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) &&
          !BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst);
 }
@@ -565,7 +573,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
       if (BC.MIB->isPseudo(Inst))
        continue;
 
-      if (!shouldInsertStub(BC, Inst)) {
+      if (!mayNeedStub(BC, Inst)) {
         DotAddress += InsnSize;
         continue;
       }
@@ -629,7 +637,271 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
   return Error::success();
 }
 
+void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
+  BinaryContext &BC = BF.getBinaryContext();
+  auto &MIB = BC.MIB;
+
+  if (!BF.isSimple())
+    return;
+
+  // Quick path.
+  if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
+    return;
+
+  auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) {
+    const unsigned Bits = MIB->getPCRelEncodingSize(Inst);
+    return isIntN(Bits, Offset);
+  };
+
+  auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress,
+                            const BinaryBasicBlock &BB) {
+    const int64_t Offset = BB.getOutputStartAddress() - InstAddress;
+    return isBranchOffsetInRange(Inst, Offset);
+  };
+
+  // Keep track of *all* function trampolines that are going to be added to the
+  // function layout at the end of relaxation.
+  std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>>
+      FunctionTrampolines;
+
+  // Function fragments are relaxed independently.
+  for (FunctionFragment &FF : BF.getLayout().fragments()) {
+    // Fill out code size estimation for the fragment. Use output BB address
+    // ranges to store offsets from the start of the function.
+    uint64_t CodeSize = 0;
+    for (BinaryBasicBlock *BB : FF) {
+      BB->setOutputStartAddress(CodeSize);
+      CodeSize += BB->estimateSize();
+      BB->setOutputEndAddress(CodeSize);
+    }
+
+    // Dynamically-updated size of the fragment.
+    uint64_t FragmentSize = CodeSize;
+
+    // Size of the trampoline in bytes.
+    constexpr uint64_t TrampolineSize = 4;
+
+    // Trampolines created for the fragment. DestinationBB -> TrampolineBB.
+    // NB: here we store only the first trampoline created for DestinationBB.
+    DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;
+
+    // Create a trampoline code after \p BB or at the end of the fragment if BB
+    // is nullptr.
+    auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
+                                  BinaryBasicBlock *TargetBB, uint64_t Count,
+                                  bool UpdateOffsets = true) {
+      std::unique_ptr<BinaryBasicBlock> TrampolineBB = BF.createBasicBlock();
+      MCInst Inst;
+      {
+        auto L = BC.scopeLock();
+        MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get());
+      }
+      TrampolineBB->addInstruction(Inst);
+      TrampolineBB->addSuccessor(TargetBB, Count);
+      TrampolineBB->setExecutionCount(Count);
+      const uint64_t TrampolineAddress =
+          BB ? BB->getOutputEndAddress() : FragmentSize;
+      TrampolineBB->setOutputStartAddress(TrampolineAddress);
+      TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
+      TrampolineBB->setFragmentNum(FF.getFragmentNum());
+
+      if (UpdateOffsets) {
+        FragmentSize += TrampolineSize;
+        for (BinaryBasicBlock *IBB : FF) {
+          if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+            IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+                                       TrampolineSize);
+            IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+                                     TrampolineSize);
+          }
+        }
+        for (auto &Pair : FunctionTrampolines) {
+          BinaryBasicBlock *IBB = Pair.second.get();
+          if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
+            continue;
+          if (IBB == TrampolineBB.get())
+            continue;
+          if (IBB->getOutputStartAddress() >= TrampolineAddress) {
+            IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
+                                       TrampolineSize);
+            IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
+                                     TrampolineSize);
+          }
+        }
+      }
+
+      if (!FragmentTrampolines.lookup(TargetBB))
+        FragmentTrampolines[TargetBB] = TrampolineBB.get();
+      FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
+                                       std::move(TrampolineBB));
+
+      return FunctionTrampolines.back().second.get();
+    };
+
+    // Pre-populate trampolines by splitting unconditional branches from the
+    // containing basic block.
+    for (BinaryBasicBlock *BB : FF) {
+      MCInst *Inst = BB->getLastNonPseudoInstr();
+      if (!Inst || !MIB->isUnconditionalBranch(*Inst))
+        continue;
+
+      const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst);
+      BB->eraseInstruction(BB->findInstruction(Inst));
+      BB->setOutputEndAddress(BB->getOutputEndAddress() - TrampolineSize);
+
+      BinaryBasicBlock::BinaryBranchInfo BI;
+      BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI);
+
+      BinaryBasicBlock *TrampolineBB =
+          addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false);
+      BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
+    }
+
+    /// Relax the branch \p Inst. Return true if basic block offsets need an
+    /// update after the trampoline insertion.
+    auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
+                           uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
+      BinaryFunction *BF = BB->getParent();
+
+      // Use branch taken count for optimal relaxation.
+      const uint64_t Count = BB->getBranchInfo(*TargetBB).Count;
+      assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
+             "Expected valid branch execution count");
+
+      // Try to reuse an existing trampoline without introducing any new code.
+      BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB);
+      if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) {
+        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+        TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() +
+                                        Count);
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+        return;
+      }
+
+      // For cold branches, check if we can introduce a trampoline at the end
+      // of the fragment that is within the branch reach. Note that such
+      // trampoline may change address later and become unreachable in which
+      // case we will need further relaxation.
+      const int64_t OffsetToEnd = FragmentSize - InstAddress;
+      if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) {
+        TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count);
+        BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+
+        return;
+      }
+
+      // Insert a new block after the current one and use it as a trampoline.
+      TrampolineBB = addTrampolineAfter(BB, TargetBB, Count);
+
+      // If the other successor is a fall-through, invert the condition code.
+      const BinaryBasicBlock *const NextBB =
+          BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false);
+      if (BB->getConditionalSuccessor(false) == NextBB) {
+        BB->swapConditionalSuccessors();
+        auto L = BC.scopeLock();
+        MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get());
+      } else {
+        auto L = BC.scopeLock();
+        MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
+      }
+      BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
+    };
+
+    bool MayNeedRelaxation;
+    uint64_t NumIterations = 0;
+    do {
+      MayNeedRelaxation = false;
+      ++NumIterations;
+      for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) {
+        BinaryBasicBlock *BB = *BBI;
+        uint64_t NextInstOffset = BB->getOutputStartAddress();
+        for (MCInst &Inst : *BB) {
+          const size_t InstAddress = NextInstOffset;
+          if (!MIB->isPseudo(Inst))
+            NextInstOffset += 4;
+
+          if (!mayNeedStub(BF.getBinaryContext(), Inst))
+            continue;
+
+          const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst);
+
+          // Span of +/-128MB.
+          if (BitsAvailable == LongestJumpBits)
+            continue;
+
+          const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst);
+          BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol);
+          assert(TargetBB &&
+                 "Basic block target expected for conditional branch.");
+
+          // Check if the relaxation is needed.
+          if (TargetBB->getFragmentNum() == FF.getFragmentNum() &&
+              isBlockInRange(Inst, InstAddress, *TargetBB))
+            continue;
+
+          relaxBranch(BB, Inst, InstAddress, TargetBB);
+
+          MayNeedRelaxation = true;
+        }
+      }
+
+      // We may have added new instructions, but the whole fragment is less than
+      // the minimum branch span.
+      if (FragmentSize < ShortestJumpSpan)
+        MayNeedRelaxation = false;
+
+    } while (MayNeedRelaxation);
+
+    LLVM_DEBUG({
+      if (NumIterations > 2) {
+        dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get()
+               << " of " << BF << " in " << NumIterations << " iterations\n";
+      }
+    });
+  }
+
+  // Add trampoline blocks from all fragments to the layout.
+  DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>>
+      Insertions;
+  for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair :
+       FunctionTrampolines) {
+    if (!Pair.second)
+      continue;
+    Insertions[Pair.first].emplace_back(std::move(Pair.second));
+  }
+
+  for (auto &Pair : Insertions) {
+    BF.insertBasicBlocks(Pair.first, std::move(Pair.second),
+                         /*UpdateLayout*/ true, /*UpdateCFI*/ true,
+                         /*RecomputeLPs*/ false);
+  }
+}
+
 Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
+
+  if (opts::CompactCodeModel) {
+    BC.outs()
+        << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";
+
+    ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+      relaxLocalBranches(BF);
+    };
+
+    ParallelUtilities::PredicateTy SkipPredicate =
+        [&](const BinaryFunction &BF) {
+          return !BC.shouldEmit(BF) || !BF.isSimple();
+        };
+
+    ParallelUtilities::runOnEachFunction(
+        BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
+        SkipPredicate, "RelaxLocalBranches");
+
+    return Error::success();
+  }
+
   BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
   std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
   bool Modified;
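The reachability test at the heart of the relaxation loop reduces to llvm::isIntN applied to the signed distance between the instruction and its target. A minimal standalone model of that decision; the 21-bit B.cond figure is our assumption about the AArch64 encoding, not a value from the patch:

    #include <cassert>
    #include <cstdint>

    // Mirrors llvm::isIntN: can X be represented as an N-bit signed integer?
    static bool isIntN(unsigned N, int64_t X) {
      return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
    }

    int main() {
      const unsigned CondBranchBits = 21; // B.cond: 19-bit word offset => +/-1MiB
      assert(isIntN(CondBranchBits, 512 * 1024));       // in range: leave as-is
      assert(!isIntN(CondBranchBits, 2 * 1024 * 1024)); // out of range: relax
      return 0;
    }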
