Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
238 changes: 176 additions & 62 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3267,29 +3267,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
return false;
assert(!ST.hasExtendedWaitCounts());

if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
if (!ST.isWave64())
return false;

const bool IsSALU = SIInstrInfo::isSALU(*MI);
const bool IsVALU = SIInstrInfo::isVALU(*MI);
if (!IsSALU && !IsVALU)
return false;

// The hazard sequence is three instructions:
// 1. VALU reads SGPR as mask
// 2. SALU writes SGPR
// 3. SALU reads SGPR
// The hazard can expire if the distance between 2 and 3 is sufficient.
// In practice this happens <10% of the time, hence this always assumes
// the hazard exists if 1 and 2 are present to avoid searching.
// 2. VALU/SALU writes SGPR
// 3. VALU/SALU reads SGPR
// The hazard can expire if the distance between 2 and 3 is sufficient,
// or (2) is VALU and (3) is SALU.
// In practice this happens <10% of the time, hence always assume the hazard
// exists if (1) and (2) are present to avoid searching all SGPR reads.

const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
if (!SDSTOp || !SDSTOp->isReg())
return false;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();

// True for EXEC/EXEC_LO/EXEC_HI, M0, SGPR_NULL/SGPR_NULL64 and SCC.
// The operand scans in this function skip any register matched here.
auto IgnoreableSGPR = [](const Register Reg) {
  const bool IsExec = Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO ||
                      Reg == AMDGPU::EXEC_HI;
  const bool IsNull = Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64;
  return IsExec || IsNull || Reg == AMDGPU::M0 || Reg == AMDGPU::SCC;
};
// True when Reg is VCC, VCC_LO or VCC_HI.
auto IsVCC = [](const Register Reg) {
  switch (Reg) {
  case AMDGPU::VCC:
  case AMDGPU::VCC_LO:
  case AMDGPU::VCC_HI:
    return true;
  default:
    return false;
  }
};

// Search state handed to hasHazard<StateType>: the set of SGPRs that are
// still being tracked for the hazard.
struct StateType {
  SmallSet<Register, 2> HazardSGPRs;

  // Two states compare equal when they track the same set of registers.
  static bool isEqual(const StateType &A, const StateType &B) {
    return A.HazardSGPRs == B.HazardSGPRs;
  }
  // The hash is derived solely from the tracked register set.
  static unsigned getHashValue(const StateType &S) {
    return hash_combine_range(S.HazardSGPRs);
  }
};

SmallVector<const MachineInstr *> WaitInstrs;
bool HasSGPRRead = false;
StateType InitialState;

// Look for SGPR write.
MachineOperand *HazardDef = nullptr;
for (MachineOperand &Op : MI->operands()) {
if (!Op.isReg())
continue;
if (Op.isDef() && HazardDef)
continue;

Register Reg = Op.getReg();
if (IgnoreableSGPR(Reg))
continue;
if (!IsVCC(Reg)) {
if (Op.isImplicit())
continue;
if (!TRI->isSGPRReg(MRI, Reg))
continue;
}
// Also check for SGPR reads.
if (Op.isUse()) {
HasSGPRRead = true;
continue;
}

assert(!HazardDef);
HazardDef = &Op;
}

const Register HazardReg = SDSTOp->getReg();
if (HazardReg == AMDGPU::EXEC ||
HazardReg == AMDGPU::EXEC_LO ||
HazardReg == AMDGPU::EXEC_HI ||
HazardReg == AMDGPU::M0)
if (!HazardDef)
return false;

auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
// Setup to track writes to individual SGPRs
const Register HazardReg = HazardDef->getReg();
if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
InitialState.HazardSGPRs.insert(HazardReg);
} else {
assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
}

auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
if (State.HazardSGPRs.empty())
return HazardExpired;

switch (I.getOpcode()) {
case AMDGPU::V_ADDC_U32_e32:
case AMDGPU::V_ADDC_U32_dpp:
Expand All @@ -3304,11 +3378,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
case AMDGPU::V_SUBB_U32_e32:
case AMDGPU::V_SUBB_U32_dpp:
case AMDGPU::V_SUBBREV_U32_e32:
case AMDGPU::V_SUBBREV_U32_dpp:
case AMDGPU::V_SUBBREV_U32_dpp: {
// These implicitly read VCC as mask source.
return HazardReg == AMDGPU::VCC ||
HazardReg == AMDGPU::VCC_LO ||
HazardReg == AMDGPU::VCC_HI;
return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
}
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_ADDC_U32_e64_dpp:
case AMDGPU::V_CNDMASK_B16_t16_e64:
Expand All @@ -3324,68 +3397,109 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
// Only check mask register overlaps.
const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
assert(SSRCOp);
return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
return Result ? HazardFound : NoHazardFound;
}
default:
return false;
return NoHazardFound;
}
};

const MachineRegisterInfo &MRI = MF.getRegInfo();
auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
// s_waitcnt_depctr sa_sdst(0) mitigates hazard.
if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
return true;

// VALU access to any SGPR or literal constant other than HazardReg
// mitigates hazard. No need to check HazardReg here as this will
// only be called when !IsHazardFn.
if (!SIInstrInfo::isVALU(I))
return false;
for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
const MachineOperand &Op = I.getOperand(OpNo);
if (Op.isReg()) {
Register OpReg = Op.getReg();
// Only consider uses
if (!Op.isUse())
const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0),
0);
auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
switch (I.getOpcode()) {
case AMDGPU::S_WAITCNT_DEPCTR:
// Record mergeable waits within a region of instructions free of SGPR reads.
if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
(I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
WaitInstrs.push_back(&I);
break;
default:
// Update tracking of SGPR reads and writes.
for (auto &Op : I.operands()) {
if (!Op.isReg())
continue;
// Ignore EXEC
if (OpReg == AMDGPU::EXEC ||
OpReg == AMDGPU::EXEC_LO ||
OpReg == AMDGPU::EXEC_HI)

Register Reg = Op.getReg();
if (IgnoreableSGPR(Reg))
continue;
// Ignore all implicit uses except VCC
if (Op.isImplicit()) {
if (OpReg == AMDGPU::VCC ||
OpReg == AMDGPU::VCC_LO ||
OpReg == AMDGPU::VCC_HI)
return true;
if (!IsVCC(Reg)) {
if (Op.isImplicit())
continue;
if (!TRI->isSGPRReg(MRI, Reg))
continue;
}
if (Op.isUse()) {
HasSGPRRead = true;
continue;
}
if (TRI.isSGPRReg(MRI, OpReg))
return true;
} else {
const MCInstrDesc &InstDesc = I.getDesc();
const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
if (!TII.isInlineConstant(Op, OpInfo))
return true;

// Stop tracking any SGPRs with writes on the basis that they will
// already have an appropriate wait inserted afterwards.
SmallVector<Register, 2> Found;
for (Register SGPR : State.HazardSGPRs) {
if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
Found.push_back(SGPR);
}
for (Register SGPR : Found)
State.HazardSGPRs.erase(SGPR);
}
break;
}
return false;
};

// Check for hazard
if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
std::numeric_limits<int>::max())
if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
MI->getParent(),
std::next(MI->getReverseIterator())))
return false;

auto NextMI = std::next(MI->getIterator());
// Compute counter mask
unsigned DepCtr =
IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
: AMDGPU::DepCtr::encodeFieldVaSdst(0))
: AMDGPU::DepCtr::encodeFieldSaSdst(0);

// Try to merge previous waits into this one for regions with no SGPR reads.
if (!WaitInstrs.empty()) {
// Note: WaitInstrs contains const pointers, so walk backward from MI to
// obtain a mutable pointer to each instruction to be merged.
// This is expected to be a very short walk within the same block.
SmallVector<MachineInstr *> ToErase;
unsigned Found = 0;
for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
End = MI->getParent()->rend();
Found < WaitInstrs.size() && It != End; ++It) {
MachineInstr *WaitMI = &*It;
// Find next wait instruction.
if (std::as_const(WaitMI) != WaitInstrs[Found])
continue;
Found++;
unsigned WaitMask = WaitMI->getOperand(0).getImm();
assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
ToErase.push_back(WaitMI);
}
assert(Found == WaitInstrs.size());
for (MachineInstr *WaitMI : ToErase)
WaitMI->eraseFromParent();
}

// Add s_waitcnt_depctr sa_sdst(0) after SALU write.
// Add s_waitcnt_depctr after SGPR write.
auto NextMI = std::next(MI->getIterator());
auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
.addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
.addImm(DepCtr);

// SALU write may be s_getpc in a bundle.
updateGetPCBundle(NewMI);
Expand Down
Loading