1616#include " AIE.h"
1717#include " AIEBaseInstrInfo.h"
1818#include " AIEBaseRegisterInfo.h"
19+ #include " AIEDataDependenceHelper.h"
20+ #include " AIELoopClass.h"
21+ #include " AIESlotStatistics.h"
1922#include " Utils/AIELoopUtils.h"
2023
2124#include " llvm/ADT/BitVector.h"
@@ -43,6 +46,9 @@ using namespace llvm;
4346
4447#define DEBUG_TYPE " aie-waw-reg-rewrite"
4548
49+ // This might be compatible with a future extension of the DEBUG rigging
// Forwards x to DEBUG_WITH_TYPE under a debug type of its own (the pass's
// DEBUG_TYPE name with a ":2" suffix) instead of DEBUG_TYPE itself.
// NOTE(review): presumably this lets the detailed dumps be selected
// independently of the LLVM_DEBUG output - confirm against the debug options.
50+ #define DEBUG_DETAIL (x ) DEBUG_WITH_TYPE(" aie-waw-reg-rewrite:2" , x)
51+
4652static cl::opt<bool > AggressiveReAlloc (
4753 " aie-aggressive-realloc" , cl::Hidden, cl::init(false ),
4854 cl::desc(" Aggressively de-allocate live-through registers to favor "
@@ -64,6 +70,13 @@ static cl::opt<bool>
6470 LatencyAware (" aie-realloc-latencyaware" , cl::Hidden, cl::init(true ),
6571 cl::desc(" Enable latency-aware allocation strategy" ));
6672
// Hidden boolean option, off by default. When set, renameMBBPhysRegs calls
// sortSWPAware on the candidates before assigning registers.
73+ static cl::opt<bool >
74+ SWPAware (" aie-realloc-swp-aware" , cl::Hidden, cl::init(false ),
75+ cl::desc(" Use assignment order based on interleaved swp stages" ));
76+
// Hidden integer option, 0 by default. sortSWPAware adds it to the minimum II
// obtained from the slot statistics; the sum is clamped to at least 1 there.
77+ static cl::opt<int > MinIIBias (" aie-realloc-ii-bias" , cl::Hidden, cl::init(0 ),
78+ cl::desc(" MinII bias for swp-aware" ));
79+
6780namespace {
6881
6982// Defines the next register to use in reallocation.
@@ -104,6 +117,7 @@ class AIEWawRegRewriter : public MachineFunctionPass {
104117 AU.addRequired <LiveIntervalsWrapperPass>();
105118 AU.addPreserved <LiveIntervalsWrapperPass>();
106119 AU.addRequired <LiveRegMatrixWrapperLegacy>();
120+ AU.addRequired <AAResultsWrapperPass>();
107121 AU.addPreserved <LiveRegMatrixWrapperLegacy>();
108122 MachineFunctionPass::getAnalysisUsage (AU);
109123 }
@@ -129,6 +143,9 @@ class AIEWawRegRewriter : public MachineFunctionPass {
129143 RoundRobin computeLRURegisters (
130144 const std::map<const TargetRegisterClass *, bool > &RegClasses);
131145
146+ /// Sort the candidates to mimic interleaving the pipeline stages.
/// \p Candidates is reordered in place by the estimated modulo cycle of the
/// instruction that owns each candidate operand; \p MBB provides the
/// instructions from which that estimate is computed.
147+ void sortSWPAware (OriginalAllocation &Candidates, MachineBasicBlock &MBB);
148+
132149 // / Pre-allocate all virtual registers in Candidates. The sole purpose of
133150 // / this is to prime the LRURegisters, so that the end of the loop is
134151 // / considered to be near to the start. No actual allocations are made.
@@ -382,6 +399,55 @@ RoundRobin AIEWawRegRewriter::computeLRURegisters(
382399 return LRURegisters;
383400}
384401
402+ void AIEWawRegRewriter::sortSWPAware (OriginalAllocation &Candidates,
403+ MachineBasicBlock &MBB) {
404+
405+ // We estimate the length of the schedule based on latencies and the
406+ // minimum II based on slots. We then estimate the modulo cycle of each
407+ // instruction based on its depth and apply LRU in the order of the modulo
408+ // cycle.
409+ // Note that both the depth and the II are underestimations since we don't
410+ // account for them interfering. Hence the modulo cycle estimate won't be
411+ // too far off.
412+ AIE::SlotStatistics Statistics = AIE::computeSlotStatistics (MBB, TII);
413+ DEBUG_DETAIL (dbgs () << " Stats=" ; Statistics.dumpShort (); dbgs () << " \n " );
414+ DEBUG_DETAIL (dbgs () << " LoopClass=" << llvm::AIE::classifyLoop (Statistics)
415+ << " \n " );
416+ const int MinII = std::max (Statistics.getMinII () + MinIIBias, 1 );
417+
418+ MachineSchedContext Context;
419+ Context.MF = MF;
420+ Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults ();
421+ AIE::DataDependenceHelper DDG (Context, true , false );
422+ for (auto &MI : MBB) {
423+ if (!MI.isTerminator ())
424+ DDG.initSUnit (MI);
425+ }
426+ DDG.buildEdges ();
427+ DEBUG_DETAIL (DDG.dumpDot (dbgs (), false ));
428+
429+ // Compute and record the modulo cycle of each instruction.
430+ std::map<const MachineInstr *, int > ModuloCycle;
431+ for (auto &SU : DDG.SUnits ) {
432+ int D = SU.getDepth ();
433+ ModuloCycle.emplace (SU.getInstr (), D % MinII);
434+ LLVM_DEBUG (dbgs () << format (" %4d D=%4d: " , SU.NodeNum , D)
435+ << *SU.getInstr ());
436+ }
437+
438+ LLVM_DEBUG (dbgs () << format (" MinII = %d\n " , MinII));
439+
440+ // Now sort the candidates to simulate the parallelism
441+ using Element = std::pair<const MachineOperand *, Register>;
442+ auto ModuloCycleLess = [&ModuloCycle](const Element &A, const Element &B) {
443+ const MachineInstr *IA = A.first ->getParent ();
444+ const MachineInstr *IB = B.first ->getParent ();
445+
446+ return ModuloCycle[IA] < ModuloCycle[IB];
447+ };
448+ llvm::sort (Candidates, ModuloCycleLess);
449+ }
450+
385451bool AIEWawRegRewriter::renameMBBPhysRegs (const MachineBasicBlock *MBB) {
386452 LLVM_DEBUG (dbgs () << " WAW Reg Renaming BasicBlock " ; MBB->dump ();
387453 dbgs () << " \n " );
@@ -473,13 +539,10 @@ bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) {
473539 }
474540 }
475541
476- // For each reg class, allocate the candidates in round-robin fashion.
477- // If we fail, we fall back to the original allocation
478- BitVector ExcludedPhysRegs{TRI->getNumRegs ()};
479-
480- // Exclude CSRs
481- for (const MCPhysReg *CSR = MRI->getCalleeSavedRegs (); CSR && *CSR; ++CSR)
482- ExcludedPhysRegs[*CSR] = true ;
542+ if (SWPAware) {
543+ auto &NCMBB = *(const_cast <MachineBasicBlock *>(MBB));
544+ sortSWPAware (Candidates, NCMBB);
545+ }
483546
484547 // Least-Recently-Used list of physical registers for assignments to VRegs.
485548 // Physical registers that have recently been used are moved to the back.
0 commit comments