
Commit f5990cf

Martien de Jong (martien-de-jong) authored and committed
[AIE][WAWRewriter] Simulate a pipeline schedule for the LRU renaming
Base the allocation order of the candidates on an approximate pipeline schedule.
1 parent 61dfcf4 commit f5990cf
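
As context for the diff below, here is a minimal standalone sketch of the ordering this patch introduces (Candidate and sortByModuloCycle are hypothetical names, not the actual OriginalAllocation machinery): estimate each candidate's modulo cycle as its defining instruction's depth modulo the minimum II, then visit the candidates in that order so that definitions from different software-pipeline stages interleave in the LRU renaming.

// Illustrative sketch only; Candidate and sortByModuloCycle are made-up names,
// not part of the patch. The real pass derives the depth from a data
// dependence graph and MinII from slot statistics.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Candidate {
  unsigned VirtReg; // stand-in for the register being re-allocated
  int DefDepth;     // estimated scheduling depth of the defining instruction
};

static void sortByModuloCycle(std::vector<Candidate> &Candidates, int MinII) {
  // Stable sort keeps program order within one modulo cycle.
  std::stable_sort(Candidates.begin(), Candidates.end(),
                   [MinII](const Candidate &A, const Candidate &B) {
                     return A.DefDepth % MinII < B.DefDepth % MinII;
                   });
}

int main() {
  // Depths 0, 1, 2, 3, 4 with MinII = 3 fold onto modulo cycles 0, 1, 2, 0, 1.
  std::vector<Candidate> Candidates = {{1, 0}, {3, 1}, {5, 2}, {2, 3}, {4, 4}};
  sortByModuloCycle(Candidates, /*MinII=*/3);
  for (const Candidate &C : Candidates)
    std::printf("vreg %u: modulo cycle %d\n", C.VirtReg, C.DefDepth % 3);
  return 0;
}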

1 file changed: +70 −7 lines changed


llvm/lib/Target/AIE/AIEWawRegRewriter.cpp

Lines changed: 70 additions & 7 deletions
@@ -16,6 +16,9 @@
 #include "AIE.h"
 #include "AIEBaseInstrInfo.h"
 #include "AIEBaseRegisterInfo.h"
+#include "AIEDataDependenceHelper.h"
+#include "AIELoopClass.h"
+#include "AIESlotStatistics.h"
 #include "Utils/AIELoopUtils.h"

 #include "llvm/ADT/BitVector.h"
@@ -43,6 +46,9 @@ using namespace llvm;

 #define DEBUG_TYPE "aie-waw-reg-rewrite"

+// This might be compatible with a future extension of the DEBUG rigging
+#define DEBUG_DETAIL(x) DEBUG_WITH_TYPE("aie-waw-reg-rewrite:2", x)
+
 static cl::opt<bool> AggressiveReAlloc(
     "aie-aggressive-realloc", cl::Hidden, cl::init(false),
     cl::desc("Aggressively de-allocate live-through registers to favor "
@@ -64,6 +70,13 @@ static cl::opt<bool>
     LatencyAware("aie-realloc-latencyaware", cl::Hidden, cl::init(true),
                  cl::desc("Enable latency-aware allocation strategy"));

+static cl::opt<bool>
+    SWPAware("aie-realloc-swp-aware", cl::Hidden, cl::init(false),
+             cl::desc("Use assignment order based on interleaved swp stages"));
+
+static cl::opt<int> MinIIBias("aie-realloc-ii-bias", cl::Hidden, cl::init(0),
+                              cl::desc("MinII bias for swp-aware"));
+
 namespace {

 // Defines the next register to use in reallocation.
@@ -104,6 +117,7 @@ class AIEWawRegRewriter : public MachineFunctionPass {
     AU.addRequired<LiveIntervalsWrapperPass>();
     AU.addPreserved<LiveIntervalsWrapperPass>();
     AU.addRequired<LiveRegMatrixWrapperLegacy>();
+    AU.addRequired<AAResultsWrapperPass>();
     AU.addPreserved<LiveRegMatrixWrapperLegacy>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -129,6 +143,9 @@ class AIEWawRegRewriter : public MachineFunctionPass {
   RoundRobin computeLRURegisters(
       const std::map<const TargetRegisterClass *, bool> &RegClasses);

+  /// Sort the candidates to mimic interleaving the pipeline stages
+  void sortSWPAware(OriginalAllocation &Candidates, MachineBasicBlock &MBB);
+
   /// Pre-allocate all virtual registers in Candidates. The sole purpose of
   /// this is to prime the LRURegisters, so that the end of the loop is
   /// considered to be near to the start. No actual allocations are made.
@@ -382,6 +399,55 @@ RoundRobin AIEWawRegRewriter::computeLRURegisters(
   return LRURegisters;
 }

+void AIEWawRegRewriter::sortSWPAware(OriginalAllocation &Candidates,
+                                     MachineBasicBlock &MBB) {
+
+  // We estimate the length of the schedule based on latencies and the
+  // minimum II based on slots. We then estimate the modulo cycle of each
+  // instruction based on its depth and apply LRU in the order of the modulo
+  // cycle.
+  // Note that both the depth and the II are underestimations since we don't
+  // account for them interfering. Hence the modulo cycle estimate won't be
+  // too far off.
+  AIE::SlotStatistics Statistics = AIE::computeSlotStatistics(MBB, TII);
+  DEBUG_DETAIL(dbgs() << "Stats="; Statistics.dumpShort(); dbgs() << "\n");
+  DEBUG_DETAIL(dbgs() << "LoopClass=" << llvm::AIE::classifyLoop(Statistics)
+                      << "\n");
+  const int MinII = std::max(Statistics.getMinII() + MinIIBias, 1);
+
+  MachineSchedContext Context;
+  Context.MF = MF;
+  Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  AIE::DataDependenceHelper DDG(Context, true, false);
+  for (auto &MI : MBB) {
+    if (!MI.isTerminator())
+      DDG.initSUnit(MI);
+  }
+  DDG.buildEdges();
+  DEBUG_DETAIL(DDG.dumpDot(dbgs(), false));
+
+  // Compute and record the modulo cycle of each instruction.
+  std::map<const MachineInstr *, int> ModuloCycle;
+  for (auto &SU : DDG.SUnits) {
+    int D = SU.getDepth();
+    ModuloCycle.emplace(SU.getInstr(), D % MinII);
+    LLVM_DEBUG(dbgs() << format("%4d D=%4d: ", SU.NodeNum, D)
+                      << *SU.getInstr());
+  }
+
+  LLVM_DEBUG(dbgs() << format("MinII = %d\n", MinII));
+
+  // Now sort the candidates to simulate the parallelism
+  using Element = std::pair<const MachineOperand *, Register>;
+  auto ModuloCycleLess = [&ModuloCycle](const Element &A, const Element &B) {
+    const MachineInstr *IA = A.first->getParent();
+    const MachineInstr *IB = B.first->getParent();
+
+    return ModuloCycle[IA] < ModuloCycle[IB];
+  };
+  llvm::sort(Candidates, ModuloCycleLess);
+}
+
 bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) {
   LLVM_DEBUG(dbgs() << "WAW Reg Renaming BasicBlock "; MBB->dump();
              dbgs() << "\n");
@@ -473,13 +539,10 @@ bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) {
     }
   }

-  // For each reg class, allocate the candidates in round-robin fashion.
-  // If we fail, we fall back to the original allocation
-  BitVector ExcludedPhysRegs{TRI->getNumRegs()};
-
-  // Exclude CSRs
-  for (const MCPhysReg *CSR = MRI->getCalleeSavedRegs(); CSR && *CSR; ++CSR)
-    ExcludedPhysRegs[*CSR] = true;
+  if (SWPAware) {
+    auto &NCMBB = *(const_cast<MachineBasicBlock *>(MBB));
+    sortSWPAware(Candidates, NCMBB);
+  }

   // Least-Recently-Used list of physical registers for assignments to VRegs.
   // Physical registers that have recently been used are moved to the back.
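
As a rough numeric illustration of the comment in sortSWPAware (the numbers are made up, not taken from the patch): with a slot-based minimum II of 3 and aie-realloc-ii-bias left at its default of 0, MinII = max(3 + 0, 1) = 3, and defining instructions at depths 0, 1, 2, 3 and 4 fold onto modulo cycles 0, 1, 2, 0 and 1. The depth-0 and depth-3 definitions, which a software-pipelined schedule would issue in the same cycle of different stages, are therefore renamed back to back, which is the interleaving the LRU order is meant to approximate. Because both the depth and the slot-based II ignore interference, they are lower bounds, and the bias option presumably lets one nudge MinII upward when that estimate turns out too optimistic.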
