@@ -218,6 +218,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
218218 " block placement." ),
219219 cl::init(UINT_MAX), cl::Hidden);
220220
221+ // Apply the ext-tsp algorithm minimizing the size of a binary.
222+ static cl::opt<bool >
223+ ApplyExtTspForSize (" apply-ext-tsp-for-size" , cl::init(false ), cl::Hidden,
224+ cl::desc(" Use ext-tsp for size-aware block placement." ));
225+
221226namespace llvm {
222227extern cl::opt<bool > EnableExtTspBlockPlacement;
223228extern cl::opt<bool > ApplyExtTspWithoutProfile;
@@ -595,7 +600,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
595600 void precomputeTriangleChains ();
596601
597602 // / Apply a post-processing step optimizing block placement.
598- void applyExtTsp ();
603+ void applyExtTsp (bool OptForSize );
599604
600605 // / Modify the existing block placement in the function and adjust all jumps.
601606 void assignBlockOrder (const std::vector<const MachineBasicBlock *> &NewOrder);
@@ -3505,20 +3510,36 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35053510 // Initialize tail duplication thresholds.
35063511 initTailDupThreshold ();
35073512
3513+ const bool OptForSize =
3514+ MF.getFunction ().hasOptSize () ||
3515+ llvm::shouldOptimizeForSize (&MF, PSI, &MBFI->getMBFI ());
3516+ // Determine whether to use ext-tsp for perf/size optimization. The method
3517+ // is beneficial only for instances with at least 3 basic blocks and it can be
3518+ // disabled for huge functions (exceeding a certain size).
3519+ bool UseExtTspForPerf = false ;
3520+ bool UseExtTspForSize = false ;
3521+ if (3 <= MF.size () && MF.size () <= ExtTspBlockPlacementMaxBlocks) {
3522+ UseExtTspForPerf =
3523+ EnableExtTspBlockPlacement &&
3524+ (ApplyExtTspWithoutProfile || MF.getFunction ().hasProfileData ());
3525+ UseExtTspForSize = OptForSize && ApplyExtTspForSize;
3526+ }
3527+
35083528 // Apply tail duplication.
35093529 if (allowTailDupPlacement ()) {
35103530 MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree ();
3511- bool OptForSize = MF.getFunction ().hasOptSize () ||
3512- llvm::shouldOptimizeForSize (&MF, PSI, &MBFI->getMBFI ());
35133531 if (OptForSize)
35143532 TailDupSize = 1 ;
35153533 const bool PreRegAlloc = false ;
35163534 TailDup.initMF (MF, PreRegAlloc, MBPI, MBFI.get (), PSI,
35173535 /* LayoutMode */ true , TailDupSize);
3518- precomputeTriangleChains ();
3536+ if (!UseExtTspForSize)
3537+ precomputeTriangleChains ();
35193538 }
35203539
3521- buildCFGChains ();
3540+ // Run the main block placement.
3541+ if (!UseExtTspForSize)
3542+ buildCFGChains ();
35223543
35233544 // Changing the layout can create new tail merging opportunities.
35243545 // TailMerge can create jump into if branches that make CFG irreducible for
@@ -3545,14 +3566,14 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35453566 }
35463567 }
35473568
3548- // Apply a post-processing optimizing block placement.
3549- if (MF. size () >= 3 && EnableExtTspBlockPlacement &&
3550- (ApplyExtTspWithoutProfile || MF. getFunction (). hasProfileData ()) &&
3551- MF. size () <= ExtTspBlockPlacementMaxBlocks ) {
3552- // Find a new placement and modify the layout of the blocks in the function.
3553- applyExtTsp ();
3554-
3555- // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3569+ // Apply a post-processing step optimizing block placement:
3570+ // - find a new placement and modify the layout of the blocks in the function;
3571+ // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3572+ if (UseExtTspForPerf || UseExtTspForSize ) {
3573+ assert (
3574+ !(UseExtTspForPerf && UseExtTspForSize) &&
3575+ " UseExtTspForPerf and UseExtTspForSize can not be set simultaneosly " );
3576+ applyExtTsp ( /* OptForSize= */ UseExtTspForSize);
35563577 createCFGChainExtTsp ();
35573578 }
35583579
@@ -3577,7 +3598,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35773598 return true ;
35783599}
35793600
3580- void MachineBlockPlacement::applyExtTsp () {
3601+ void MachineBlockPlacement::applyExtTsp (bool OptForSize ) {
35813602 // Prepare data; blocks are indexed by their index in the current ordering.
35823603 DenseMap<const MachineBasicBlock *, uint64_t > BlockIndex;
35833604 BlockIndex.reserve (F->size ());
@@ -3589,13 +3610,15 @@ void MachineBlockPlacement::applyExtTsp() {
35893610 CurrentBlockOrder.push_back (&MBB);
35903611 }
35913612
3592- auto BlockSizes = std::vector<uint64_t >(F->size ());
3593- auto BlockCounts = std::vector<uint64_t >(F->size ());
3594- std::vector<codelayout::EdgeCount> JumpCounts;
3613+ SmallVector<uint64_t , 0 > BlockCounts (F->size ());
3614+ SmallVector<uint64_t , 0 > BlockSizes (F->size ());
3615+ SmallVector<codelayout::EdgeCount, 0 > JumpCounts;
3616+ SmallVector<MachineOperand, 4 > Cond; // For analyzeBranch.
3617+ SmallVector<const MachineBasicBlock *, 4 > Succs;
35953618 for (MachineBasicBlock &MBB : *F) {
35963619 // Getting the block frequency.
35973620 BlockFrequency BlockFreq = MBFI->getBlockFreq (&MBB);
3598- BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency ();
3621+ BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency ();
35993622 // Getting the block size:
36003623 // - approximate the size of an instruction by 4 bytes, and
36013624 // - ignore debug instructions.
@@ -3604,23 +3627,49 @@ void MachineBlockPlacement::applyExtTsp() {
36043627 // not see a perf improvement with the exact block sizes.
36053628 auto NonDbgInsts =
36063629 instructionsWithoutDebug (MBB.instr_begin (), MBB.instr_end ());
3607- int NumInsts = std::distance (NonDbgInsts.begin (), NonDbgInsts.end ());
3630+ size_t NumInsts = std::distance (NonDbgInsts.begin (), NonDbgInsts.end ());
36083631 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
3632+
36093633 // Getting jump frequencies.
3610- for (MachineBasicBlock *Succ : MBB.successors ()) {
3611- auto EP = MBPI->getEdgeProbability (&MBB, Succ);
3612- BlockFrequency JumpFreq = BlockFreq * EP;
3613- JumpCounts.push_back (
3614- {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency ()});
3634+ if (OptForSize) {
3635+ Cond.clear ();
3636+ MachineBasicBlock *TBB = nullptr , *FBB = nullptr ; // For analyzeBranch.
3637+ if (TII->analyzeBranch (MBB, TBB, FBB, Cond))
3638+ continue ;
3639+
3640+ const MachineBasicBlock *FTB = MBB.getFallThrough ();
3641+ // Succs is a collection of distinct destinations of the block reachable
3642+ // from MBB via a jump instruction; initialize the list using the three
3643+ // (not necessarily distinct) blocks, FTB, TBB, and FBB.
3644+ Succs.clear ();
3645+ if (TBB && TBB != FTB)
3646+ Succs.push_back (TBB);
3647+ if (FBB && FBB != FTB)
3648+ Succs.push_back (FBB);
3649+ if (FTB)
3650+ Succs.push_back (FTB);
3651+ // Absolute magnitude of non-zero counts does not matter for the
3652+ // optimization; slightly prioritize jumps with a single successor, since
3653+ // the corresponding jump instruction will be removed from the binary.
3654+ const uint64_t Freq = Succs.size () == 1 ? 110 : 100 ;
3655+ for (const MachineBasicBlock *Succ : Succs)
3656+ JumpCounts.push_back ({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3657+ } else {
3658+ for (MachineBasicBlock *Succ : MBB.successors ()) {
3659+ auto EP = MBPI->getEdgeProbability (&MBB, Succ);
3660+ BlockFrequency JumpFreq = BlockFreq * EP;
3661+ JumpCounts.push_back (
3662+ {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency ()});
3663+ }
36153664 }
36163665 }
36173666
36183667 LLVM_DEBUG (dbgs () << " Applying ext-tsp layout for |V| = " << F->size ()
36193668 << " with profile = " << F->getFunction ().hasProfileData ()
3620- << " (" << F->getName (). str () << " )"
3621- << " \n " );
3622- LLVM_DEBUG ( dbgs () << format ( " original layout score: %0.2f \n " ,
3623- calcExtTspScore (BlockSizes, JumpCounts) ));
3669+ << " (" << F->getName () << " )" << " \n " );
3670+
3671+ const double OrgScore = calcExtTspScore (BlockSizes, JumpCounts);
3672+ LLVM_DEBUG ( dbgs () << format ( " original layout score: %0.2f \n " , OrgScore ));
36243673
36253674 // Run the layout algorithm.
36263675 auto NewOrder = computeExtTspLayout (BlockSizes, BlockCounts, JumpCounts);
@@ -3629,12 +3678,14 @@ void MachineBlockPlacement::applyExtTsp() {
36293678 for (uint64_t Node : NewOrder) {
36303679 NewBlockOrder.push_back (CurrentBlockOrder[Node]);
36313680 }
3632- LLVM_DEBUG (
3633- dbgs () << format (" optimized layout score: %0.2f\n " ,
3634- calcExtTspScore (NewOrder, BlockSizes, JumpCounts)));
3681+ const double OptScore = calcExtTspScore (NewOrder, BlockSizes, JumpCounts);
3682+ LLVM_DEBUG (dbgs () << format (" optimized layout score: %0.2f\n " , OptScore));
36353683
3636- // Assign new block order.
3637- assignBlockOrder (NewBlockOrder);
3684+ // If the optimization is unsuccessful, fall back to the original block order.
3685+ if (OptForSize && OrgScore > OptScore)
3686+ assignBlockOrder (CurrentBlockOrder);
3687+ else
3688+ assignBlockOrder (NewBlockOrder);
36383689}
36393690
36403691void MachineBlockPlacement::assignBlockOrder (
0 commit comments