@@ -218,6 +218,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
218218 " block placement." ),
219219 cl::init(UINT_MAX), cl::Hidden);
220220
221+ // Apply the ext-tsp algorithm to minimize the size of the binary (off by default).
222+ static cl::opt<bool >
223+ ApplyExtTspForSize (" apply-ext-tsp-for-size" , cl::init(false ), cl::Hidden,
224+ cl::desc(" Use ext-tsp for size-aware block placement." ));
225+
221226namespace llvm {
222227extern cl::opt<bool > EnableExtTspBlockPlacement;
223228extern cl::opt<bool > ApplyExtTspWithoutProfile;
@@ -595,7 +600,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
595600 void precomputeTriangleChains ();
596601
597602 // / Apply a post-processing step optimizing block placement.
598- void applyExtTsp ();
603+ void applyExtTsp (bool OptForSize );
599604
600605 // / Modify the existing block placement in the function and adjust all jumps.
601606 void assignBlockOrder (const std::vector<const MachineBasicBlock *> &NewOrder);
@@ -3505,20 +3510,29 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35053510 // Initialize tail duplication thresholds.
35063511 initTailDupThreshold ();
35073512
3513+ const bool OptForSize =
3514+ MF.getFunction ().hasOptSize () ||
3515+ llvm::shouldOptimizeForSize (&MF, PSI, &MBFI->getMBFI ());
3516+ // Using ext-tsp for size optimization is possible only when the function
3517+ // contains more than two basic blocks.
3518+ const bool UseExtTspForSize =
3519+ OptForSize && ApplyExtTspForSize && MF.size () >= 3 ;
3520+
35083521 // Apply tail duplication.
35093522 if (allowTailDupPlacement ()) {
35103523 MPDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree ();
3511- bool OptForSize = MF.getFunction ().hasOptSize () ||
3512- llvm::shouldOptimizeForSize (&MF, PSI, &MBFI->getMBFI ());
35133524 if (OptForSize)
35143525 TailDupSize = 1 ;
35153526 const bool PreRegAlloc = false ;
35163527 TailDup.initMF (MF, PreRegAlloc, MBPI, MBFI.get (), PSI,
35173528 /* LayoutMode */ true , TailDupSize);
3518- precomputeTriangleChains ();
3529+ if (!UseExtTspForSize)
3530+ precomputeTriangleChains ();
35193531 }
35203532
3521- buildCFGChains ();
3533+ // Run the main block placement.
3534+ if (!UseExtTspForSize)
3535+ buildCFGChains ();
35223536
35233537 // Changing the layout can create new tail merging opportunities.
35243538 // TailMerge can create jump into if branches that make CFG irreducible for
@@ -3545,15 +3559,19 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35453559 }
35463560 }
35473561
3548- // Apply a post-processing optimizing block placement.
3549- if (MF.size () >= 3 && EnableExtTspBlockPlacement &&
3550- (ApplyExtTspWithoutProfile || MF.getFunction ().hasProfileData ()) &&
3551- MF.size () <= ExtTspBlockPlacementMaxBlocks) {
3552- // Find a new placement and modify the layout of the blocks in the function.
3553- applyExtTsp ();
3554-
3555- // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
3556- createCFGChainExtTsp ();
3562+ // Apply a post-processing step optimizing block placement:
3563+ // - find a new placement and modify the layout of the blocks in the function;
3564+ // - re-create CFG chains so that we can optimizeBranches and alignBlocks.
3565+ if (MF.size () >= 3 ) {
3566+ if (EnableExtTspBlockPlacement &&
3567+ (ApplyExtTspWithoutProfile || MF.getFunction ().hasProfileData ()) &&
3568+ MF.size () <= ExtTspBlockPlacementMaxBlocks) {
3569+ applyExtTsp (false );
3570+ createCFGChainExtTsp ();
3571+ } else if (UseExtTspForSize) {
3572+ applyExtTsp (true );
3573+ createCFGChainExtTsp ();
3574+ }
35573575 }
35583576
35593577 optimizeBranches ();
@@ -3577,7 +3595,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
35773595 return true ;
35783596}
35793597
3580- void MachineBlockPlacement::applyExtTsp () {
3598+ void MachineBlockPlacement::applyExtTsp (bool OptForSize ) {
35813599 // Prepare data; blocks are indexed by their index in the current ordering.
35823600 DenseMap<const MachineBasicBlock *, uint64_t > BlockIndex;
35833601 BlockIndex.reserve (F->size ());
@@ -3589,13 +3607,15 @@ void MachineBlockPlacement::applyExtTsp() {
35893607 CurrentBlockOrder.push_back (&MBB);
35903608 }
35913609
3592- auto BlockSizes = std::vector<uint64_t >(F->size ());
3593- auto BlockCounts = std::vector<uint64_t >(F->size ());
3610+ std::vector<uint64_t > BlockCounts (F->size ());
3611+ std::vector<uint64_t > BlockSizes (F->size ());
35943612 std::vector<codelayout::EdgeCount> JumpCounts;
3613+ SmallVector<MachineOperand, 4 > Cond; // For analyzeBranch.
3614+ SmallVector<const MachineBasicBlock *, 4 > Succs;
35953615 for (MachineBasicBlock &MBB : *F) {
35963616 // Getting the block frequency.
35973617 BlockFrequency BlockFreq = MBFI->getBlockFreq (&MBB);
3598- BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency ();
3618+ BlockCounts[BlockIndex[&MBB]] = OptForSize ? 1 : BlockFreq.getFrequency ();
35993619 // Getting the block size:
36003620 // - approximate the size of an instruction by 4 bytes, and
36013621 // - ignore debug instructions.
@@ -3604,23 +3624,48 @@ void MachineBlockPlacement::applyExtTsp() {
36043624 // not see a perf improvement with the exact block sizes.
36053625 auto NonDbgInsts =
36063626 instructionsWithoutDebug (MBB.instr_begin (), MBB.instr_end ());
3607- int NumInsts = std::distance (NonDbgInsts.begin (), NonDbgInsts.end ());
3627+ size_t NumInsts = std::distance (NonDbgInsts.begin (), NonDbgInsts.end ());
36083628 BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
36093629 // Getting jump frequencies.
3610- for (MachineBasicBlock *Succ : MBB.successors ()) {
3611- auto EP = MBPI->getEdgeProbability (&MBB, Succ);
3612- BlockFrequency JumpFreq = BlockFreq * EP;
3613- JumpCounts.push_back (
3614- {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency ()});
3630+
3631+ if (!OptForSize) {
3632+ for (MachineBasicBlock *Succ : MBB.successors ()) {
3633+ auto EP = MBPI->getEdgeProbability (&MBB, Succ);
3634+ BlockFrequency JumpFreq = BlockFreq * EP;
3635+ JumpCounts.push_back (
3636+ {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency ()});
3637+ }
3638+ } else {
3639+ Cond.clear ();
3640+ MachineBasicBlock *TBB = nullptr , *FBB = nullptr ; // For analyzeBranch.
3641+ if (TII->analyzeBranch (MBB, TBB, FBB, Cond))
3642+ continue ;
3643+
3644+ const MachineBasicBlock *FTB = MBB.getFallThrough ();
3645+
3646+ Succs.clear ();
3647+ if (TBB && TBB != FTB)
3648+ Succs.push_back (TBB);
3649+ if (FBB && FBB != FTB)
3650+ Succs.push_back (FBB);
3651+ if (FTB)
3652+ Succs.push_back (FTB);
3653+ // Absolute magnitude of non-zero counts does not matter for the
3654+ // optimization; slightly prioritize jumps from blocks with a single
3655+ // successor, since the corresponding jump instruction will be removed from the binary.
3656+ const uint64_t Freq = Succs.size () == 1 ? 110 : 100 ;
3657+ for (const MachineBasicBlock *Succ : Succs) {
3658+ JumpCounts.push_back ({BlockIndex[&MBB], BlockIndex[Succ], Freq});
3659+ }
36153660 }
36163661 }
36173662
36183663 LLVM_DEBUG (dbgs () << " Applying ext-tsp layout for |V| = " << F->size ()
36193664 << " with profile = " << F->getFunction ().hasProfileData ()
3620- << " (" << F->getName ().str () << " )"
3621- << " \n " );
3622- LLVM_DEBUG ( dbgs () << format ( " original layout score: %0.2f \n " ,
3623- calcExtTspScore (BlockSizes, JumpCounts) ));
3665+ << " (" << F->getName ().str () << " )" << " \n " );
3666+
3667+ const double OrgScore = calcExtTspScore (BlockSizes, BlockCounts, JumpCounts);
3668+ LLVM_DEBUG ( dbgs () << format ( " original layout score: %0.2f \n " , OrgScore ));
36243669
36253670 // Run the layout algorithm.
36263671 auto NewOrder = computeExtTspLayout (BlockSizes, BlockCounts, JumpCounts);
@@ -3629,12 +3674,15 @@ void MachineBlockPlacement::applyExtTsp() {
36293674 for (uint64_t Node : NewOrder) {
36303675 NewBlockOrder.push_back (CurrentBlockOrder[Node]);
36313676 }
3632- LLVM_DEBUG (
3633- dbgs () << format ( " optimized layout score: %0.2f \n " ,
3634- calcExtTspScore (NewOrder, BlockSizes, JumpCounts) ));
3677+ const double OptScore =
3678+ calcExtTspScore (NewOrder, BlockSizes, BlockCounts, JumpCounts);
3679+ LLVM_DEBUG ( dbgs () << format ( " optimized layout score: %0.2f \n " , OptScore ));
36353680
3636- // Assign new block order.
3637- assignBlockOrder (NewBlockOrder);
3681+ // If the optimization is unsuccessful, fall back to the original block order.
3682+ if (OptForSize && OrgScore > OptScore)
3683+ assignBlockOrder (CurrentBlockOrder);
3684+ else
3685+ assignBlockOrder (NewBlockOrder);
36383686}
36393687
36403688void MachineBlockPlacement::assignBlockOrder (
0 commit comments