Commit 385c121

[AMDGPU] Rework GFX11 VALU Mask Write Hazard (#138663)
Apply additional counter waits to address VALU writes to SGPRs. Rework expiry detection and apply wait coalescing to mitigate some of the additional waits.
1 parent c1779f3 commit 385c121
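
The gist of the rework is visible in the counter-mask computation in the diff
below: which DEP_CTR field the inserted s_waitcnt_depctr must drain depends on
what kind of instruction wrote the SGPR. A minimal sketch of that selection,
assuming compilation inside the AMDGPU backend (Utils/AMDGPUBaseInfo.h declares
the AMDGPU::DepCtr helpers the patch itself calls); pickCounterWait and its two
flags are illustrative names, not part of the patch:

// Sketch: choose the dep_ctr counter that must be zero after the hazardous
// SGPR write. Mirrors the "Compute counter mask" step of the patch.
#include "Utils/AMDGPUBaseInfo.h"
using namespace llvm;

static unsigned pickCounterWait(bool IsVALUWrite, bool WritesVCC) {
  if (IsVALUWrite)
    return WritesVCC ? AMDGPU::DepCtr::encodeFieldVaVcc(0)   // VALU wrote VCC
                     : AMDGPU::DepCtr::encodeFieldVaSdst(0); // VALU wrote SGPR
  return AMDGPU::DepCtr::encodeFieldSaSdst(0);               // SALU wrote SGPR
}

The SALU case is the pre-existing sa_sdst(0) wait; the two VALU cases are the
additional waits the commit message refers to.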

12 files changed (+741, -124 lines)

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 176 additions & 62 deletions
@@ -3267,29 +3267,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
     return false;
   assert(!ST.hasExtendedWaitCounts());
 
-  if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
+  if (!ST.isWave64())
+    return false;
+
+  const bool IsSALU = SIInstrInfo::isSALU(*MI);
+  const bool IsVALU = SIInstrInfo::isVALU(*MI);
+  if (!IsSALU && !IsVALU)
     return false;
 
   // The hazard sequence is three instructions:
   // 1. VALU reads SGPR as mask
-  // 2. SALU writes SGPR
-  // 3. SALU reads SGPR
-  // The hazard can expire if the distance between 2 and 3 is sufficient.
-  // In practice this happens <10% of the time, hence this always assumes
-  // the hazard exists if 1 and 2 are present to avoid searching.
+  // 2. VALU/SALU writes SGPR
+  // 3. VALU/SALU reads SGPR
+  // The hazard can expire if the distance between 2 and 3 is sufficient,
+  // or (2) is VALU and (3) is SALU.
+  // In practice this happens <10% of the time, hence always assume the hazard
+  // exists if (1) and (2) are present to avoid searching all SGPR reads.
 
-  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
-  if (!SDSTOp || !SDSTOp->isReg())
-    return false;
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  auto IgnoreableSGPR = [](const Register Reg) {
+    switch (Reg) {
+    case AMDGPU::EXEC:
+    case AMDGPU::EXEC_LO:
+    case AMDGPU::EXEC_HI:
+    case AMDGPU::M0:
+    case AMDGPU::SGPR_NULL:
+    case AMDGPU::SGPR_NULL64:
+    case AMDGPU::SCC:
+      return true;
+    default:
+      return false;
+    }
+  };
+  auto IsVCC = [](const Register Reg) {
+    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
+  };
+
+  struct StateType {
+    SmallSet<Register, 2> HazardSGPRs;
+
+    static unsigned getHashValue(const StateType &State) {
+      return hash_combine_range(State.HazardSGPRs);
+    }
+    static bool isEqual(const StateType &LHS, const StateType &RHS) {
+      return LHS.HazardSGPRs == RHS.HazardSGPRs;
+    }
+  };
+
+  SmallVector<const MachineInstr *> WaitInstrs;
+  bool HasSGPRRead = false;
+  StateType InitialState;
+
+  // Look for SGPR write.
+  MachineOperand *HazardDef = nullptr;
+  for (MachineOperand &Op : MI->operands()) {
+    if (!Op.isReg())
+      continue;
+    if (Op.isDef() && HazardDef)
+      continue;
+
+    Register Reg = Op.getReg();
+    if (IgnoreableSGPR(Reg))
+      continue;
+    if (!IsVCC(Reg)) {
+      if (Op.isImplicit())
+        continue;
+      if (!TRI->isSGPRReg(MRI, Reg))
+        continue;
+    }
+    // Also check for SGPR reads.
+    if (Op.isUse()) {
+      HasSGPRRead = true;
+      continue;
+    }
+
+    assert(!HazardDef);
+    HazardDef = &Op;
+  }
 
-  const Register HazardReg = SDSTOp->getReg();
-  if (HazardReg == AMDGPU::EXEC ||
-      HazardReg == AMDGPU::EXEC_LO ||
-      HazardReg == AMDGPU::EXEC_HI ||
-      HazardReg == AMDGPU::M0)
+  if (!HazardDef)
     return false;
 
-  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
+  // Setup to track writes to individual SGPRs
+  const Register HazardReg = HazardDef->getReg();
+  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
+    InitialState.HazardSGPRs.insert(HazardReg);
+  } else {
+    assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
+    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
+    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
+  }
+
+  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
+    if (State.HazardSGPRs.empty())
+      return HazardExpired;
+
     switch (I.getOpcode()) {
     case AMDGPU::V_ADDC_U32_e32:
     case AMDGPU::V_ADDC_U32_dpp:
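
This hunk drops the old "write must be an SALU sdst" precondition and instead
collects the written SGPR (or the two 32-bit halves of a 64-bit mask) into a
StateType that the backward search threads through every instruction it
visits; getHashValue/isEqual exist so the search can memoize visited states
when it crosses block boundaries. A minimal single-block sketch of the driver
shape this implies (hasHazardSketch, the int stand-ins for MachineInstr, and
the missing predecessor walk and memoization are all simplifications, not the
real hasHazard helper):

#include <vector>

// Three-way answer returned by the reworked IsHazardFn above.
enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound };

// Walk backward from the SGPR write. IsHazard looks for instruction (1) of
// the sequence (a VALU mask read of a still-tracked SGPR); UpdateState prunes
// the state, e.g. dropping SGPRs that were overwritten in between.
template <typename StateT, typename HazardFn, typename UpdateFn>
bool hasHazardSketch(StateT State, const std::vector<int> &InstrsInReverse,
                     HazardFn IsHazard, UpdateFn UpdateState) {
  for (int Instr : InstrsInReverse) {
    switch (IsHazard(State, Instr)) {
    case HazardFound:
      return true;   // Mask read found: the wait is required.
    case HazardExpired:
      return false;  // Nothing left to track: no wait needed.
    case NoHazardFound:
      UpdateState(State, Instr);
      break;
    }
  }
  return false;      // Search exhausted without finding instruction (1).
}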
@@ -3304,11 +3378,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
     case AMDGPU::V_SUBB_U32_e32:
     case AMDGPU::V_SUBB_U32_dpp:
     case AMDGPU::V_SUBBREV_U32_e32:
-    case AMDGPU::V_SUBBREV_U32_dpp:
+    case AMDGPU::V_SUBBREV_U32_dpp: {
       // These implicitly read VCC as mask source.
-      return HazardReg == AMDGPU::VCC ||
-             HazardReg == AMDGPU::VCC_LO ||
-             HazardReg == AMDGPU::VCC_HI;
+      return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
+    }
     case AMDGPU::V_ADDC_U32_e64:
     case AMDGPU::V_ADDC_U32_e64_dpp:
     case AMDGPU::V_CNDMASK_B16_t16_e64:
@@ -3324,68 +3397,109 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
       // Only check mask register overlaps.
       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
       assert(SSRCOp);
-      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
+      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
+      return Result ? HazardFound : NoHazardFound;
     }
     default:
-      return false;
+      return NoHazardFound;
     }
   };
 
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
-    // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
-    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
-        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
-      return true;
-
-    // VALU access to any SGPR or literal constant other than HazardReg
-    // mitigates hazard. No need to check HazardReg here as this will
-    // only be called when !IsHazardFn.
-    if (!SIInstrInfo::isVALU(I))
-      return false;
-    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
-      const MachineOperand &Op = I.getOperand(OpNo);
-      if (Op.isReg()) {
-        Register OpReg = Op.getReg();
-        // Only consider uses
-        if (!Op.isUse())
+  const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
+      AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0),
+      0);
+  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
+    switch (I.getOpcode()) {
+    case AMDGPU::S_WAITCNT_DEPCTR:
+      // Record mergable waits within region of instructions free of SGPR reads.
+      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
+          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
+        WaitInstrs.push_back(&I);
+      break;
+    default:
+      // Update tracking of SGPR reads and writes.
+      for (auto &Op : I.operands()) {
+        if (!Op.isReg())
           continue;
-        // Ignore EXEC
-        if (OpReg == AMDGPU::EXEC ||
-            OpReg == AMDGPU::EXEC_LO ||
-            OpReg == AMDGPU::EXEC_HI)
+
+        Register Reg = Op.getReg();
+        if (IgnoreableSGPR(Reg))
           continue;
-        // Ignore all implicit uses except VCC
-        if (Op.isImplicit()) {
-          if (OpReg == AMDGPU::VCC ||
-              OpReg == AMDGPU::VCC_LO ||
-              OpReg == AMDGPU::VCC_HI)
-            return true;
+        if (!IsVCC(Reg)) {
+          if (Op.isImplicit())
+            continue;
+          if (!TRI->isSGPRReg(MRI, Reg))
+            continue;
+        }
+        if (Op.isUse()) {
+          HasSGPRRead = true;
           continue;
         }
-        if (TRI.isSGPRReg(MRI, OpReg))
-          return true;
-      } else {
-        const MCInstrDesc &InstDesc = I.getDesc();
-        const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
-        if (!TII.isInlineConstant(Op, OpInfo))
-          return true;
+
+        // Stop tracking any SGPRs with writes on the basis that they will
+        // already have an appropriate wait inserted afterwards.
+        SmallVector<Register, 2> Found;
+        for (Register SGPR : State.HazardSGPRs) {
+          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
+            Found.push_back(SGPR);
+        }
+        for (Register SGPR : Found)
+          State.HazardSGPRs.erase(SGPR);
       }
+      break;
     }
-    return false;
   };
 
   // Check for hazard
-  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
-      std::numeric_limits<int>::max())
+  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
+                            MI->getParent(),
+                            std::next(MI->getReverseIterator())))
     return false;
 
-  auto NextMI = std::next(MI->getIterator());
+  // Compute counter mask
+  unsigned DepCtr =
+      IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
+                                 : AMDGPU::DepCtr::encodeFieldVaSdst(0))
+             : AMDGPU::DepCtr::encodeFieldSaSdst(0);
+
+  // Try to merge previous waits into this one for regions with no SGPR reads.
+  if (!WaitInstrs.empty()) {
+    // Note: WaitInstrs contains const pointers, so walk backward from MI to
+    // obtain a mutable pointer to each instruction to be merged.
+    // This is expected to be a very short walk within the same block.
+    SmallVector<MachineInstr *> ToErase;
+    unsigned Found = 0;
+    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
+                                             End = MI->getParent()->rend();
+         Found < WaitInstrs.size() && It != End; ++It) {
+      MachineInstr *WaitMI = &*It;
+      // Find next wait instruction.
+      if (std::as_const(WaitMI) != WaitInstrs[Found])
+        continue;
+      Found++;
+      unsigned WaitMask = WaitMI->getOperand(0).getImm();
+      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
+      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
+          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
+                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
+      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
+          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
+                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
+      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
+          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
+                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
+      ToErase.push_back(WaitMI);
+    }
+    assert(Found == WaitInstrs.size());
+    for (MachineInstr *WaitMI : ToErase)
+      WaitMI->eraseFromParent();
+  }
 
-  // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
+  // Add s_waitcnt_depctr after SGPR write.
+  auto NextMI = std::next(MI->getIterator());
   auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                        TII.get(AMDGPU::S_WAITCNT_DEPCTR))
-                   .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+                   .addImm(DepCtr);
 
   // SALU write may be s_getpc in a bundle.
   updateGetPCBundle(NewMI);
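
The coalescing step at the end of this hunk folds earlier s_waitcnt_depctr
instructions into the newly built one by taking, field by field, the minimum
(strictest) counter value, then erasing the originals; this is only attempted
when no SGPR read was seen in between (the HasSGPRRead guard) and the waits
sit unbundled in the same block. A self-contained sketch of the per-field
merge; the unpacked three-field struct is an assumption made for readability,
not the packed immediate that AMDGPU::DepCtr::encodeField*/decodeField*
operate on:

#include <algorithm>

// Unpacked stand-in for the three DEP_CTR fields the patch merges.
struct DepCtrWait {
  unsigned SaSdst; // outstanding SALU SGPR writes tolerated
  unsigned VaVcc;  // outstanding VALU VCC writes tolerated
  unsigned VaSdst; // outstanding VALU SGPR writes tolerated
};

// Two waits collapse into one by keeping the stricter (smaller) requirement
// per field; a field value of 0 means "drain that counter completely".
static DepCtrWait mergeWaits(const DepCtrWait &A, const DepCtrWait &B) {
  return {std::min(A.SaSdst, B.SaSdst), std::min(A.VaVcc, B.VaVcc),
          std::min(A.VaSdst, B.VaSdst)};
}

One merged wait can then serve both the old sa_sdst requirement and the new
va_sdst/va_vcc ones, which is how the patch claws back some of the extra
waits it introduces.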
