Skip to content

Commit bd30247

Browse files
committed
- Improve the heuristic as to when to use latency reduction. Do this only when
either there are many nodes in data-sequences, or when IsAcyclicLatencyLimited is true. - Remove the lengthy handling for tiny regions containing long latency instructions - it doesn't appear to be needed. - Slight refactoring of computeSULivenessScore() with PDiffs (NFC).
1 parent 9193b2d commit bd30247

File tree

2 files changed

+61
-85
lines changed

2 files changed

+61
-85
lines changed

llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp

Lines changed: 58 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -214,11 +214,11 @@ int SystemZPreRASchedStrategy::computeSULivenessScore(
214214
if (!MI->getNumOperands() || MI->isCopy())
215215
return 0;
216216

217-
const MachineOperand &DefMO = MI->getOperand(0);
218-
assert(!isPhysRegDef(DefMO) && "Did not expect physreg def!");
217+
const MachineOperand &MO0 = MI->getOperand(0);
218+
assert(!isPhysRegDef(MO0) && "Did not expect physreg def!");
219219
bool IsLoad =
220-
isRegDef(DefMO) && !DefMO.isDead() && !IsRedefining[SU->NodeNum];
221-
bool IsStore = (!isRegDef(DefMO) || DefMO.isDead());
220+
isRegDef(MO0) && !MO0.isDead() && !IsRedefining[SU->NodeNum];
221+
bool IsStore = (!isRegDef(MO0) || MO0.isDead());
222222
bool PreservesSchedLat = SU->getHeight() <= Zone->getScheduledLatency();
223223
const unsigned Cycles = 2;
224224
unsigned Margin = SchedModel->getIssueWidth() * (Cycles + SU->Latency - 1);
@@ -252,10 +252,10 @@ int SystemZPreRASchedStrategy::computeSULivenessScore(
252252
// Find the interesting properties.
253253
// Prioritize FP: Ignore GPR/Addr kills with an FP def.
254254
UsesLivePrio = IsLoad && !PrioKill &&
255-
(isPrioVirtReg(DefMO.getReg(), &DAG->MRI) || !GPRKill);
255+
(isPrioVirtReg(MO0.getReg(), &DAG->MRI) || !GPRKill);
256256
UsesLiveAll = !PrioKill && !GPRKill;
257257
StoreKill = (PrioKill || (!HasPrioUse && GPRKill));
258-
} else {
258+
} else if (MO0.isReg() && MO0.getReg().isVirtual()) {
259259
int PrioPressureChange = 0;
260260
int GPRPressureChange = 0;
261261
const PressureDiff &PDiff = DAG->getPressureDiff(SU);
@@ -267,22 +267,21 @@ int SystemZPreRASchedStrategy::computeSULivenessScore(
267267
else if (PC.getPSet() == GPRPressureSet)
268268
GPRPressureChange += PC.getUnitInc();
269269
}
270+
const TargetRegisterClass *RC = DAG->MRI.getRegClass(MO0.getReg());
271+
int RegWeight = TRI->getRegClassWeight(RC).RegWeight;
270272
if (IsLoad) {
271-
const TargetRegisterClass *RC = DAG->MRI.getRegClass(DefMO.getReg());
272-
int DefWeight = -int(TRI->getRegClassWeight(RC).RegWeight);
273-
bool PrioDefNoKill = PrioPressureChange == DefWeight;
274-
bool GPRDefNoKill = GPRPressureChange == DefWeight;
273+
bool PrioDefNoKill = PrioPressureChange == -RegWeight;
274+
bool GPRDefNoKill = GPRPressureChange == -RegWeight;
275275
UsesLivePrio =
276-
(PrioDefNoKill || (PrioPressureChange == 0 && GPRDefNoKill));
277-
UsesLiveAll = (PrioDefNoKill && GPRPressureChange == 0) ||
278-
(PrioPressureChange == 0 && GPRDefNoKill);
276+
(PrioDefNoKill || (!PrioPressureChange && GPRDefNoKill));
277+
UsesLiveAll = (PrioDefNoKill && !GPRPressureChange) ||
278+
(!PrioPressureChange && GPRDefNoKill);
279279
}
280-
if (IsStore && FirstStoreInGroupScheduled && StoresGroup.count(SU)) {
281-
Register SrcReg = MI->getOperand(0).getReg();
282-
bool SrcKill = !DAG->getBotRPTracker().isRegLive(SrcReg);
280+
else if (IsStore && FirstStoreInGroupScheduled && StoresGroup.count(SU)) {
281+
bool SrcKill = !DAG->getBotRPTracker().isRegLive(MO0.getReg());
283282
StoreKill =
284-
SrcKill && (PrioPressureChange > 0 ||
285-
(PrioPressureChange == 0 && GPRPressureChange > 0));
283+
SrcKill && (PrioPressureChange == RegWeight ||
284+
(!PrioPressureChange && GPRPressureChange == RegWeight));
286285
}
287286
}
288287

@@ -342,8 +341,10 @@ bool SystemZPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
342341
if (tryLess(TryCandScore, CandScore, TryCand, Cand, LivenessReduce))
343342
return TryCand.Reason != NoCand;
344343

345-
// Don't extend the scheduled latency.
346-
if (ShouldReduceLatency &&
344+
// Don't extend the scheduled latency in regions with many nodes in
345+
// simple data sequences, or for (single block loop) regions that are
346+
// acyclically (within a single loop iteration) latency limited.
347+
if ((HasDataSequences || Rem.IsAcyclicLatencyLimited) &&
347348
TryCand.SU->getHeight() != Cand.SU->getHeight() &&
348349
(std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
349350
Zone->getScheduledLatency())) {
@@ -392,79 +393,53 @@ void SystemZPreRASchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
392393
void SystemZPreRASchedStrategy::initialize(ScheduleDAGMI *dag) {
393394
GenericScheduler::initialize(dag);
394395

395-
const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(DAG->TII);
396-
if (TinyRegion) {
397-
// A tiny region with long latency instructions is better handled using
398-
// normal heuristics, except in regions that have COPYs of a physreg both
399-
// ways and/or have a compare-0 likely to be eliminated.
400-
const SUnit *CmpZeroSU = nullptr;
401-
const SUnit *CmpSrcSU = nullptr;
402-
Register CmpSrcReg = 0;
403-
bool OtherCCClob = false;
404-
unsigned MaxLat = 0;
405-
std::set<Register> PRegs;
406-
bool CopysPRegDep = false;
407-
for (unsigned Idx = DAG->SUnits.size() - 1; Idx + 1 != 0; --Idx) {
408-
const SUnit *SU = &DAG->SUnits[Idx];
409-
const MachineInstr *MI = SU->getInstr();
410-
411-
// Check for a (likely) eliminable compare-0.
412-
if (TII->isCompareZero(*MI)) {
413-
CmpZeroSU = SU;
414-
CmpSrcReg = TII->getCompareSourceReg(*MI);
415-
continue;
416-
}
417-
if (MI->getNumOperands()) {
418-
const MachineOperand &DefMO = MI->getOperand(0);
419-
// Doing this instead of SU data preds happens to also handle the
420-
// case where CmpSrcReg is redefined.
421-
if (isVirtRegDef(DefMO) && DefMO.getReg() == CmpSrcReg &&
422-
MI->getDesc().hasImplicitDefOfPhysReg(SystemZ::CC))
423-
CmpSrcSU = SU;
424-
}
425-
if (SU != CmpZeroSU && SU != CmpSrcSU &&
426-
MI->getDesc().hasImplicitDefOfPhysReg(SystemZ::CC))
427-
OtherCCClob = true;
428-
429-
// Check for long latency instructions.
430-
MaxLat = std::max(MaxLat, unsigned(SU->Latency));
431-
432-
// Check for COPYs of pregs both in and out of the region.
433-
if (MI->isCopy()) {
434-
Register DstReg = MI->getOperand(0).getReg();
435-
Register SrcReg = MI->getOperand(1).getReg();
436-
if (DstReg.isPhysical() && DAG->MRI.isAllocatable(DstReg) &&
437-
SrcReg.isVirtual())
438-
PRegs.insert(DstReg);
439-
else if (SrcReg.isPhysical() && DAG->MRI.isAllocatable(SrcReg) &&
440-
DstReg.isVirtual()) {
441-
if (!PRegs.insert(SrcReg).second)
442-
CopysPRegDep = true;
443-
}
444-
}
445-
}
446-
bool CmpElimRegion = CmpZeroSU && CmpSrcSU && OtherCCClob;
447-
448-
if (DAG->SUnits.size() > 6 && MaxLat >= 6 && !CopysPRegDep &&
449-
!CmpElimRegion)
450-
TinyRegion = false;
451-
}
452396
LLVM_DEBUG(dbgs() << "Region is" << (TinyRegion ? "" : " not") << " tiny.\n");
453397
if (TinyRegion)
454398
return;
455399

456400
NumLeft = DAG->SUnits.size();
457401
RemLat = ~0U;
458402

459-
// It seems to work best to include the latencies in this heuristic (as
460-
// opposed to something like a "unit SU height" with all latencies counted
461-
// as 1).
403+
// Enable latency reduction for a region that has a considerable amount of
404+
// data sequences so that they become interleaved. These are SUs that only
405+
// have one data predecessor / successor edge(s) to their adjacent
406+
// SU(s). Disable if region has many SUs relative to the overall height.
462407
unsigned DAGHeight = 0;
463408
for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx)
464409
DAGHeight = std::max(DAGHeight, DAG->SUnits[Idx].getHeight());
465-
ShouldReduceLatency = DAG->SUnits.size() < 3 * std::max(DAGHeight, 1u);
466-
LLVM_DEBUG(if (ShouldReduceLatency) dbgs() << "Latency scheduling enabled.\n";
467-
else dbgs() << "Latency scheduling disabled.\n";);
410+
if ((HasDataSequences = DAG->SUnits.size() < 3 * std::max(DAGHeight, 1u))) {
411+
unsigned CurrSequence = 0, NumSeqNodes = 0;
412+
auto countSequence = [&CurrSequence, &NumSeqNodes]() {
413+
NumSeqNodes += CurrSequence >= 2 ? CurrSequence : 0;
414+
CurrSequence = 0;
415+
};
416+
for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
417+
const SUnit *SU = &DAG->SUnits[Idx];
418+
bool InDataSequence = true;
419+
unsigned NumPreds = 0;
420+
for (const SDep &Pred : SU->Preds)
421+
if (++NumPreds != 1 || Pred.getKind() != SDep::Data ||
422+
Pred.getSUnit()->NodeNum != Idx - 1)
423+
InDataSequence = false;
424+
unsigned NumSuccs = 0;
425+
for (const SDep &Succ : SU->Succs)
426+
if (Succ.getSUnit() != &DAG->ExitSU &&
427+
(++NumSuccs != 1 || Succ.getKind() != SDep::Data))
428+
InDataSequence = false;
429+
if (!InDataSequence || !NumPreds)
430+
countSequence();
431+
if (InDataSequence)
432+
CurrSequence++;
433+
}
434+
countSequence();
435+
if (NumSeqNodes >= DAG->SUnits.size() / 4)
436+
LLVM_DEBUG(dbgs() << "Number of nodes in def-use sequences: "
437+
<< NumSeqNodes << ". ";);
438+
else
439+
HasDataSequences = false;
440+
}
441+
LLVM_DEBUG(dbgs() << "Latency scheduling " << (HasDataSequences ? "" : "not ")
442+
<< "enabled for data sequences.\n";);
468443

469444
// If MI uses the register it defines, record it one time here.
470445
IsRedefining = std::vector<bool>(DAG->SUnits.size(), false);

llvm/lib/Target/SystemZ/SystemZMachineScheduler.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,9 @@ class SystemZPreRASchedStrategy : public GenericScheduler {
5454
// Num instructions left to schedule.
5555
unsigned NumLeft;
5656

57-
// True if latency scheduling is enabled.
58-
bool ShouldReduceLatency;
57+
// True if the region has many instructions in def-use sequences and would
58+
// likely benefit from latency reduction.
59+
bool HasDataSequences;
5960

6061
// True if MI is also using the register it defines.
6162
std::vector<bool> IsRedefining;

0 commit comments

Comments
 (0)