Skip to content

Commit ef2d6d9

Browse files
author
Jinsong Ji
committed
[PowerPC] Enable MachinePipeliner for P9 with -ppc-enable-pipeliner
Implement the necessary target hooks to enable MachinePipeliner for P9 only. The pass is off by default and can be enabled with -ppc-enable-pipeliner for P9. Differential Revision: https://reviews.llvm.org/D62164 llvm-svn: 363085
1 parent 10c0855 commit ef2d6d9

File tree

12 files changed

+227
-19
lines changed

12 files changed

+227
-19
lines changed

llvm/include/llvm/CodeGen/MachinePipeliner.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -318,9 +318,9 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
318318
MBBVectorTy &EpilogBBs);
319319
void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
320320
SMSchedule &Schedule);
321-
void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
322-
MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
323-
ValueMapTy *VRMap);
321+
void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
322+
MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
323+
SMSchedule &Schedule, ValueMapTy *VRMap);
324324
bool computeDelta(MachineInstr &MI, unsigned &Delta);
325325
void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
326326
unsigned Num);

llvm/include/llvm/CodeGen/TargetInstrInfo.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -670,8 +670,9 @@ class TargetInstrInfo : public MCInstrInfo {
670670
/// is finished. Return the value/register of the new loop count. We need
671671
/// this function when peeling off one or more iterations of a loop. This
672672
/// function assumes the nth iteration is peeled first.
673-
virtual unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineInstr *IndVar,
674-
MachineInstr &Cmp,
673+
virtual unsigned reduceLoopCount(MachineBasicBlock &MBB,
674+
MachineBasicBlock &PreHeader,
675+
MachineInstr *IndVar, MachineInstr &Cmp,
675676
SmallVectorImpl<MachineOperand> &Cond,
676677
SmallVectorImpl<MachineInstr *> &PrevInsts,
677678
unsigned Iter, unsigned MaxIter) const {

llvm/include/llvm/CodeGen/TargetSubtargetInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,9 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
193193
/// for preRA scheduling with the source level scheduler.
194194
virtual bool enableMachineSchedDefaultSched() const { return true; }
195195

196+
/// True if the subtarget should run MachinePipeliner. The default is true;
/// a subtarget overrides this hook to opt out or to gate the pass.
197+
virtual bool enableMachinePipeliner() const { return true; }
198+
196199
/// True if the subtarget should enable joining global copies.
197200
///
198201
/// By default this is enabled if the machine scheduler is enabled, but

llvm/lib/CodeGen/MachinePipeliner.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,9 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
187187
!EnableSWPOptSize.getPosition())
188188
return false;
189189

190+
if (!mf.getSubtarget().enableMachinePipeliner())
191+
return false;
192+
190193
// Cannot pipeline loops without instruction itineraries if we are using
191194
// DFA for the pipeliner.
192195
if (mf.getSubtarget().useDFAforSMS() &&
@@ -2026,6 +2029,10 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
20262029
InstrMapTy InstrMap;
20272030

20282031
SmallVector<MachineBasicBlock *, 4> PrologBBs;
2032+
2033+
MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
2034+
assert(PreheaderBB != nullptr &&
2035+
"Need to add code to handle loops w/o preheader");
20292036
// Generate the prolog instructions that set up the pipeline.
20302037
generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
20312038
MF.insert(BB->getIterator(), KernelBB);
@@ -2082,7 +2089,7 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
20822089
removeDeadInstructions(KernelBB, EpilogBBs);
20832090

20842091
// Add branches between prolog and epilog blocks.
2085-
addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
2092+
addBranches(*PreheaderBB, PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
20862093

20872094
// Remove the original loop since it's no longer referenced.
20882095
for (auto &I : *BB)
@@ -2767,7 +2774,8 @@ static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
27672774
/// Create branches from each prolog basic block to the appropriate epilog
27682775
/// block. These edges are needed if the loop ends before reaching the
27692776
/// kernel.
2770-
void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
2777+
void SwingSchedulerDAG::addBranches(MachineBasicBlock &PreheaderBB,
2778+
MBBVectorTy &PrologBBs,
27712779
MachineBasicBlock *KernelBB,
27722780
MBBVectorTy &EpilogBBs,
27732781
SMSchedule &Schedule, ValueMapTy *VRMap) {
@@ -2794,8 +2802,8 @@ void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
27942802
// Check if the LOOP0 has already been removed. If so, then there is no need
27952803
// to reduce the trip count.
27962804
if (LC != 0)
2797-
LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
2798-
MaxIter);
2805+
LC = TII->reduceLoopCount(*Prolog, PreheaderBB, IndVar, *Cmp, Cond,
2806+
PrevInsts, j, MaxIter);
27992807

28002808
// Record the value of the first trip count, which is used to determine if
28012809
// branches and blocks can be removed for constant trip counts.

llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -697,11 +697,11 @@ bool HexagonInstrInfo::analyzeLoop(MachineLoop &L,
697697
/// Generate code to reduce the loop iteration by one and check if the loop is
698698
/// finished. Return the value/register of the new loop count. This function
699699
/// assumes the nth iteration is peeled first.
700-
unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
701-
MachineInstr *IndVar, MachineInstr &Cmp,
702-
SmallVectorImpl<MachineOperand> &Cond,
703-
SmallVectorImpl<MachineInstr *> &PrevInsts,
704-
unsigned Iter, unsigned MaxIter) const {
700+
unsigned HexagonInstrInfo::reduceLoopCount(
701+
MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar,
702+
MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
703+
SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
704+
unsigned MaxIter) const {
705705
// We expect a hardware loop currently. This means that IndVar is set
706706
// to null, and the compare is the ENDLOOP instruction.
707707
assert((!IndVar) && isEndLoopN(Cmp.getOpcode())

llvm/lib/Target/Hexagon/HexagonInstrInfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo {
139139
/// is finished. Return the value/register of the new loop count. We need
140140
/// this function when peeling off one or more iterations of a loop. This
141141
/// function assumes the nth iteration is peeled first.
142-
unsigned reduceLoopCount(MachineBasicBlock &MBB,
142+
unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader,
143143
MachineInstr *IndVar, MachineInstr &Cmp,
144144
SmallVectorImpl<MachineOperand> &Cond,
145145
SmallVectorImpl<MachineInstr *> &PrevInsts,

llvm/lib/Target/PowerPC/PPCInstrInfo.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3922,3 +3922,77 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
39223922
}
39233923
return false;
39243924
}
3925+
3926+
// Returns true when \p Opcode is the decrement-CTR-and-branch-if-nonzero
// opcode for the current subtarget: PPC::BDNZ8 if Subtarget.isPPC64() is
// true, PPC::BDNZ otherwise. Only the variant matching the subtarget is
// accepted; the other one compares unequal and yields false.
bool PPCInstrInfo::isBDNZ(unsigned Opcode) const {
3927+
return (Opcode == (Subtarget.isPPC64() ? PPC::BDNZ8 : PPC::BDNZ));
3928+
}
3929+
3930+
// Recognizes loops whose bottom block ends in a BDNZ/BDNZ8 terminator.
//
// Note the inverted return convention: false means the loop WAS analyzed,
// true means it could not be analyzed.
//
// \param L           Loop to inspect; only its bottom block is examined.
// \param IndVarInst  Set to nullptr on success; untouched on failure.
// \param CmpInst     Set to the BDNZ terminator on success; untouched on
//                    failure.
// \returns false if the first terminator of the bottom block satisfies
//          isBDNZ(), true otherwise.
bool PPCInstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
3931+
MachineInstr *&CmpInst) const {
3932+
// NOTE(review): LoopEnd is dereferenced without a null check; assumes
// getBottomBlock() always yields a block here -- TODO confirm.
MachineBasicBlock *LoopEnd = L.getBottomBlock();
3933+
MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
3934+
// Only CTR loops are "analyzed" right now: the block must have a terminator
// and the first one must be the subtarget's BDNZ opcode.
3935+
if (I != LoopEnd->end() && isBDNZ(I->getOpcode())) {
3936+
IndVarInst = nullptr;
3937+
CmpInst = &*I;
3938+
return false;
3939+
}
3940+
return true;
3941+
}
3942+
3943+
// Searches \p PreHeader for the hardware-loop set-up instruction and returns
// the first match, or nullptr if the block contains none. The opcode looked
// for is PPC::MTCTR8loop when Subtarget.isPPC64() is true and PPC::MTCTRloop
// otherwise. Only \p PreHeader itself is scanned.
MachineInstr *
3944+
PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const {
3945+
3946+
unsigned LOOPi = (Subtarget.isPPC64() ? PPC::MTCTR8loop : PPC::MTCTRloop);
3947+
3948+
// The loop set-up instruction is expected to be in the preheader.
3949+
for (auto &I : PreHeader.instrs())
3950+
if (I.getOpcode() == LOOPi)
3951+
return &I;
3952+
return nullptr;
3953+
}
3954+
3955+
// Reduces the trip count of a CTR loop by one for a peeled iteration.
//
// \param MBB        Used only to reach the enclosing MachineFunction.
// \param PreHeader  Block searched for the MTCTRloop/MTCTR8loop set-up
//                   instruction (via findLoopInstr).
// \param IndVar     Must be null (asserted).
// \param Cmp        Must be a BDNZ/BDNZ8 instruction (asserted).
// \param Cond       For a run-time trip count, receives two operands: the
//                   immediate 0 and the CTR8/CTR register.
// \param PrevInsts, Iter, MaxIter  Not referenced by this implementation.
//
// \returns 0 if no set-up instruction is found, if the count register has no
//          unique defining instruction, or if the constant trip count is <= 1
//          (in which case both the LI/LI8 and the set-up instruction are
//          erased); the decremented constant if the trip count is a constant
//          greater than 1; otherwise the trip-count register.
unsigned PPCInstrInfo::reduceLoopCount(
3956+
MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar,
3957+
MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
3958+
SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
3959+
unsigned MaxIter) const {
3960+
// We expect a hardware (CTR) loop currently. This means that IndVar is set
3961+
// to null, and the compare is the BDNZ/BDNZ8 instruction.
3962+
assert((!IndVar) && isBDNZ(Cmp.getOpcode()) && "Expecting a CTR loop");
3963+
MachineFunction *MF = MBB.getParent();
3964+
// NOTE(review): DL is never used in this function body.
DebugLoc DL = Cmp.getDebugLoc();
3965+
MachineInstr *Loop = findLoopInstr(PreHeader);
3966+
if (!Loop)
3967+
return 0;
3968+
unsigned LoopCountReg = Loop->getOperand(0).getReg();
3969+
MachineRegisterInfo &MRI = MF->getRegInfo();
3970+
MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg);
3971+
3972+
if (!LoopCount)
3973+
return 0;
3974+
// If the loop trip count is a compile-time value (loaded by LI/LI8), then
3975+
// just change the immediate.
3976+
if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI) {
3977+
int64_t Offset = LoopCount->getOperand(1).getImm();
3978+
// A trip count of 1 or less leaves nothing to iterate: drop both the
// constant load and the loop set-up instruction.
// NOTE(review): the LI is erased without checking for other users of
// LoopCountReg -- presumably it has none; verify.
if (Offset <= 1) {
3979+
LoopCount->eraseFromParent();
3980+
Loop->eraseFromParent();
3981+
return 0;
3982+
}
3983+
LoopCount->getOperand(1).setImm(Offset - 1);
3984+
// NOTE(review): int64_t is implicitly narrowed to the unsigned return type.
return Offset - 1;
3985+
}
3986+
3987+
// The loop trip count is a run-time value.
3988+
// We need to subtract one from the trip count,
3989+
// and insert a branch later to check if we're done with the loop.
3990+
3991+
// The BDZ/BDZ8 that will be inserted later also decrements the CTR by 1,
3992+
// so we don't need to generate anything here.
3993+
Cond.push_back(MachineOperand::CreateImm(0));
3994+
Cond.push_back(MachineOperand::CreateReg(
3995+
Subtarget.isPPC64() ? PPC::CTR8 : PPC::CTR, true));
3996+
return LoopCountReg;
3997+
}
3998+

llvm/lib/Target/PowerPC/PPCInstrInfo.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,34 @@ class PPCInstrInfo : public PPCGenInstrInfo {
457457
}
458458
return Reg;
459459
}
460+
461+
/// Check whether \p Opcode is BDNZ (decrement CTR and branch if it is still nonzero).
462+
bool isBDNZ(unsigned Opcode) const;
463+
464+
/// Find the hardware loop instruction used to set-up the specified loop.
465+
/// On PPC, we have two instructions used to set-up the hardware loop
466+
/// (MTCTRloop, MTCTR8loop) with corresponding endloop (BDNZ, BDNZ8)
467+
/// instructions to indicate the end of a loop.
468+
MachineInstr *findLoopInstr(MachineBasicBlock &PreHeader) const;
469+
470+
/// Analyze the loop code to find the loop induction variable and compare used
471+
/// to compute the number of iterations. Currently, we analyze loops that are
472+
/// controlled using hardware loops. In this case, the induction variable
473+
/// instruction is null. For all other cases, this function returns true,
474+
/// which means we're unable to analyze it. \p IndVarInst and \p CmpInst will
475+
/// return new values when we can analyze the read-only loop \p L; otherwise,
476+
/// nothing is changed.
477+
bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
478+
MachineInstr *&CmpInst) const override;
479+
/// Generate code to reduce the loop iteration by one and check if the loop
480+
/// is finished. Return the value/register of the new loop count. We need
481+
/// this function when peeling off one or more iterations of a loop. This
482+
/// function assumes the last iteration is peeled first.
483+
unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader,
484+
MachineInstr *IndVar, MachineInstr &Cmp,
485+
SmallVectorImpl<MachineOperand> &Cond,
486+
SmallVectorImpl<MachineInstr *> &PrevInsts,
487+
unsigned Iter, unsigned MaxIter) const override;
460488
};
461489

462490
}

llvm/lib/Target/PowerPC/PPCSubtarget.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@ static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned",
3939
cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"),
4040
cl::Hidden);
4141

42+
static cl::opt<bool>
43+
EnableMachinePipeliner("ppc-enable-pipeliner",
44+
cl::desc("Enable Machine Pipeliner for PPC"),
45+
cl::init(false), cl::Hidden);
46+
4247
PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
4348
StringRef FS) {
4449
initializeEnvironment();
@@ -181,10 +186,14 @@ bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
181186
return false;
182187
}
183188

184-
bool PPCSubtarget::enableMachineScheduler() const {
185-
return true;
189+
bool PPCSubtarget::enableMachineScheduler() const { return true; }
190+
191+
// The MachinePipeliner runs only when both conditions hold: DarwinDirective
// is PPC::DIR_PWR9 and the hidden -ppc-enable-pipeliner option (default
// false) has been set.
bool PPCSubtarget::enableMachinePipeliner() const {
192+
return (DarwinDirective == PPC::DIR_PWR9) && EnableMachinePipeliner;
186193
}
187194

195+
// PPC does not use the DFA for the software pipeliner (SMS); this always
// returns false.
bool PPCSubtarget::useDFAforSMS() const { return false; }
196+
188197
// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
189198
bool PPCSubtarget::enablePostRAScheduler() const { return true; }
190199

llvm/lib/Target/PowerPC/PPCSubtarget.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -322,9 +322,13 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
322322
/// but may expand the ISEL instruction later.
323323
bool enableEarlyIfConversion() const override { return true; }
324324

325-
// Scheduling customization.
325+
/// Scheduling customization.
326326
bool enableMachineScheduler() const override;
327-
// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
327+
/// Pipeliner customization.
328+
bool enableMachinePipeliner() const override;
329+
/// Machine Pipeliner customization
330+
bool useDFAforSMS() const override;
331+
/// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
328332
bool enablePostRAScheduler() const override;
329333
AntiDepBreakMode getAntiDepBreakMode() const override;
330334
void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;

0 commit comments

Comments
 (0)