@@ -3267,29 +3267,106 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
32673267 return false ;
32683268 assert (!ST.hasExtendedWaitCounts ());
32693269
3270- if (!ST.isWave64 () || !SIInstrInfo::isSALU (*MI))
3270+ if (!ST.isWave64 ())
3271+ return false ;
3272+
3273+ const bool IsSALU = SIInstrInfo::isSALU (*MI);
3274+ const bool IsVALU = SIInstrInfo::isVALU (*MI);
3275+ if (!IsSALU && !IsVALU)
32713276 return false ;
32723277
32733278 // The hazard sequence is three instructions:
32743279 // 1. VALU reads SGPR as mask
3275- // 2. SALU writes SGPR
3276- // 3. SALU reads SGPR
3277- // The hazard can expire if the distance between 2 and 3 is sufficient.
3278- // In practice this happens <10% of the time, hence this always assumes
3279- // the hazard exists if 1 and 2 are present to avoid searching.
3280+ // 2. VALU/SALU writes SGPR
3281+ // 3. VALU/SALU reads SGPR
3282+ // The hazard can expire if the distance between 2 and 3 is sufficient,
3283+ // or (2) is VALU and (3) is SALU.
3284+ // In practice this happens <10% of the time, hence always assume the hazard
3285+ // exists if (1) and (2) are present to avoid searching all SGPR reads.
32803286
3281- const MachineOperand *SDSTOp = TII.getNamedOperand (*MI, AMDGPU::OpName::sdst);
3282- if (!SDSTOp || !SDSTOp->isReg ())
3283- return false ;
3287+ const SIRegisterInfo *TRI = ST.getRegisterInfo ();
3288+ const MachineRegisterInfo &MRI = MF.getRegInfo ();
3289+
3290+ auto IgnoreableSGPR = [](const Register Reg) {
3291+ switch (Reg) {
3292+ case AMDGPU::EXEC:
3293+ case AMDGPU::EXEC_LO:
3294+ case AMDGPU::EXEC_HI:
3295+ case AMDGPU::M0:
3296+ case AMDGPU::SGPR_NULL:
3297+ case AMDGPU::SGPR_NULL64:
3298+ case AMDGPU::SCC:
3299+ return true ;
3300+ default :
3301+ return false ;
3302+ }
3303+ };
3304+ auto IsVCC = [](const Register Reg) {
3305+ return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3306+ };
3307+
3308+ struct StateType {
3309+ SmallSet<Register, 2 > HazardSGPRs;
3310+
3311+ static unsigned getHashValue (const StateType &State) {
3312+ return hash_combine_range (State.HazardSGPRs );
3313+ }
3314+ static bool isEqual (const StateType &LHS, const StateType &RHS) {
3315+ return LHS.HazardSGPRs == RHS.HazardSGPRs ;
3316+ }
3317+ };
3318+
3319+ SmallVector<const MachineInstr *> WaitInstrs;
3320+ bool HasSGPRRead = false ;
3321+ StateType InitialState;
3322+
3323+ // Look for SGPR write.
3324+ MachineOperand *HazardDef = nullptr ;
3325+ for (MachineOperand &Op : MI->operands ()) {
3326+ if (!Op.isReg ())
3327+ continue ;
3328+ if (Op.isDef () && HazardDef)
3329+ continue ;
3330+
3331+ Register Reg = Op.getReg ();
3332+ if (IgnoreableSGPR (Reg))
3333+ continue ;
3334+ if (!IsVCC (Reg)) {
3335+ if (Op.isImplicit ())
3336+ continue ;
3337+ if (!TRI->isSGPRReg (MRI, Reg))
3338+ continue ;
3339+ }
3340+ // Also check for SGPR reads.
3341+ if (Op.isUse ()) {
3342+ HasSGPRRead = true ;
3343+ continue ;
3344+ }
32843345
3285- const Register HazardReg = SDSTOp-> getReg ( );
3286- if (HazardReg == AMDGPU::EXEC ||
3287- HazardReg == AMDGPU::EXEC_LO ||
3288- HazardReg == AMDGPU::EXEC_HI ||
3289- HazardReg == AMDGPU::M0 )
3346+ assert (!HazardDef );
3347+ HazardDef = &Op;
3348+ }
3349+
3350+ if (!HazardDef )
32903351 return false ;
32913352
3292- auto IsHazardFn = [HazardReg, this ](const MachineInstr &I) {
3353+ // Set up to track writes to individual SGPRs.
3354+ const Register HazardReg = HazardDef->getReg ();
3355+ if (AMDGPU::SReg_32RegClass.contains (HazardReg)) {
3356+ InitialState.HazardSGPRs .insert (HazardReg);
3357+ } else if (IsVCC (HazardReg)) {
3358+ InitialState.HazardSGPRs .insert (AMDGPU::VCC_LO);
3359+ InitialState.HazardSGPRs .insert (AMDGPU::VCC_HI);
3360+ } else {
3361+ assert (AMDGPU::SReg_64RegClass.contains (HazardReg));
3362+ InitialState.HazardSGPRs .insert (TRI->getSubReg (HazardReg, AMDGPU::sub0));
3363+ InitialState.HazardSGPRs .insert (TRI->getSubReg (HazardReg, AMDGPU::sub1));
3364+ }
3365+
3366+ auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3367+ if (State.HazardSGPRs .empty ())
3368+ return HazardExpired;
3369+
32933370 switch (I.getOpcode ()) {
32943371 case AMDGPU::V_ADDC_U32_e32:
32953372 case AMDGPU::V_ADDC_U32_dpp:
@@ -3304,11 +3381,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
33043381 case AMDGPU::V_SUBB_U32_e32:
33053382 case AMDGPU::V_SUBB_U32_dpp:
33063383 case AMDGPU::V_SUBBREV_U32_e32:
3307- case AMDGPU::V_SUBBREV_U32_dpp:
3384+ case AMDGPU::V_SUBBREV_U32_dpp: {
33083385 // These implicitly read VCC as mask source.
3309- return HazardReg == AMDGPU::VCC ||
3310- HazardReg == AMDGPU::VCC_LO ||
3311- HazardReg == AMDGPU::VCC_HI;
3386+ return IsVCC (HazardReg) ? HazardFound : NoHazardFound;
3387+ }
33123388 case AMDGPU::V_ADDC_U32_e64:
33133389 case AMDGPU::V_ADDC_U32_e64_dpp:
33143390 case AMDGPU::V_CNDMASK_B16_t16_e64:
@@ -3324,68 +3400,101 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
33243400 // Only check mask register overlaps.
33253401 const MachineOperand *SSRCOp = TII.getNamedOperand (I, AMDGPU::OpName::src2);
33263402 assert (SSRCOp);
3327- return TRI.regsOverlap (SSRCOp->getReg (), HazardReg);
3403+ bool Result = TRI->regsOverlap (SSRCOp->getReg (), HazardReg);
3404+ return Result ? HazardFound : NoHazardFound;
33283405 }
33293406 default :
3330- return false ;
3407+ return NoHazardFound ;
33313408 }
33323409 };
33333410
3334- const MachineRegisterInfo &MRI = MF.getRegInfo ();
3335- auto IsExpiredFn = [&MRI, this ](const MachineInstr &I, int ) {
3336- // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
3337- if (I.getOpcode () == AMDGPU::S_WAITCNT_DEPCTR &&
3338- AMDGPU::DepCtr::decodeFieldSaSdst (I.getOperand (0 ).getImm ()) == 0 )
3339- return true ;
3340-
3341- // VALU access to any SGPR or literal constant other than HazardReg
3342- // mitigates hazard. No need to check HazardReg here as this will
3343- // only be called when !IsHazardFn.
3344- if (!SIInstrInfo::isVALU (I))
3345- return false ;
3346- for (int OpNo = 0 , End = I.getNumOperands (); OpNo < End; ++OpNo) {
3347- const MachineOperand &Op = I.getOperand (OpNo);
3348- if (Op.isReg ()) {
3349- Register OpReg = Op.getReg ();
3350- // Only consider uses
3351- if (!Op.isUse ())
3411+ auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3412+ switch (I.getOpcode ()) {
3413+ case AMDGPU::S_WAITCNT_DEPCTR:
3414+ // Record waits within region of instructions free of SGPR reads.
3415+ if (!HasSGPRRead && I.getParent () == MI->getParent ())
3416+ WaitInstrs.push_back (&I);
3417+ break ;
3418+ default :
3419+ // Update tracking of SGPR reads and writes.
3420+ for (auto &Op : I.operands ()) {
3421+ if (!Op.isReg ())
33523422 continue ;
3353- // Ignore EXEC
3354- if (OpReg == AMDGPU::EXEC ||
3355- OpReg == AMDGPU::EXEC_LO ||
3356- OpReg == AMDGPU::EXEC_HI)
3423+
3424+ Register Reg = Op.getReg ();
3425+ if (IgnoreableSGPR (Reg))
33573426 continue ;
3358- // Ignore all implicit uses except VCC
3359- if (Op.isImplicit ()) {
3360- if (OpReg == AMDGPU::VCC ||
3361- OpReg == AMDGPU::VCC_LO ||
3362- OpReg == AMDGPU::VCC_HI)
3363- return true ;
3427+ if (!IsVCC (Reg)) {
3428+ if (Op.isImplicit ())
3429+ continue ;
3430+ if (!TRI->isSGPRReg (MRI, Reg))
3431+ continue ;
3432+ }
3433+ if (Op.isUse ()) {
3434+ HasSGPRRead = true ;
33643435 continue ;
33653436 }
3366- if (TRI.isSGPRReg (MRI, OpReg))
3367- return true ;
3368- } else {
3369- const MCInstrDesc &InstDesc = I.getDesc ();
3370- const MCOperandInfo &OpInfo = InstDesc.operands ()[OpNo];
3371- if (!TII.isInlineConstant (Op, OpInfo))
3372- return true ;
3437+
3438+ // Stop tracking any SGPRs with writes on the basis that they will
3439+ // already have an appropriate wait inserted afterwards.
3440+ SmallVector<Register, 2 > Found;
3441+ for (Register SGPR : State.HazardSGPRs ) {
3442+ if (Reg == SGPR || TRI->regsOverlap (Reg, SGPR))
3443+ Found.push_back (SGPR);
3444+ }
3445+ for (Register SGPR : Found)
3446+ State.HazardSGPRs .erase (SGPR);
33733447 }
3448+ break ;
33743449 }
3375- return false ;
33763450 };
33773451
33783452 // Check for hazard
3379- if (::getWaitStatesSince (IsHazardFn, MI, IsExpiredFn) ==
3380- std::numeric_limits<int >::max ())
3453+ if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
3454+ MI->getParent (),
3455+ std::next (MI->getReverseIterator ())))
33813456 return false ;
33823457
3383- auto NextMI = std::next (MI->getIterator ());
3458+ // Compute counter mask
3459+ unsigned DepCtr =
3460+ IsVALU ? (IsVCC (HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc (0 )
3461+ : AMDGPU::DepCtr::encodeFieldVaSdst (0 ))
3462+ : AMDGPU::DepCtr::encodeFieldSaSdst (0 );
3463+
3464+ // Try to merge previous waits into this one for regions with no SGPR reads.
3465+ if (WaitInstrs.size ()) {
3466+ const unsigned ConstantBits = AMDGPU::DepCtr::encodeFieldSaSdst (
3467+ AMDGPU::DepCtr::encodeFieldVaSdst (AMDGPU::DepCtr::encodeFieldVaVcc (0 ),
3468+ 0 ),
3469+ 0 );
3470+
3471+ for (const MachineInstr *Instr : WaitInstrs) {
3472+ // Don't touch bundled waits.
3473+ if (Instr->isBundled ())
3474+ continue ;
3475+ MachineInstr *WaitMI = const_cast <MachineInstr *>(Instr);
3476+ unsigned WaitMask = WaitMI->getOperand (0 ).getImm ();
3477+ // Only work with counters related to this hazard.
3478+ if ((WaitMask & ConstantBits) != ConstantBits)
3479+ continue ;
3480+ DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst (
3481+ DepCtr, std::min (AMDGPU::DepCtr::decodeFieldSaSdst (WaitMask),
3482+ AMDGPU::DepCtr::decodeFieldSaSdst (DepCtr)));
3483+ DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst (
3484+ DepCtr, std::min (AMDGPU::DepCtr::decodeFieldVaSdst (WaitMask),
3485+ AMDGPU::DepCtr::decodeFieldVaSdst (DepCtr)));
3486+ DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc (
3487+ DepCtr, std::min (AMDGPU::DepCtr::decodeFieldVaVcc (WaitMask),
3488+ AMDGPU::DepCtr::decodeFieldVaVcc (DepCtr)));
3489+ WaitMI->eraseFromParent ();
3490+ }
3491+ }
33843492
3385- // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3493+ // Add s_waitcnt_depctr after SGPR write.
3494+ auto NextMI = std::next (MI->getIterator ());
33863495 auto NewMI = BuildMI (*MI->getParent (), NextMI, MI->getDebugLoc (),
33873496 TII.get (AMDGPU::S_WAITCNT_DEPCTR))
3388- .addImm (AMDGPU:: DepCtr::encodeFieldSaSdst ( 0 ) );
3497+ .addImm (DepCtr);
33893498
33903499 // SALU write may be s_getpc in a bundle.
33913500 updateGetPCBundle (NewMI);
0 commit comments