@@ -3267,29 +3267,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
32673267 return false ;
32683268 assert (!ST.hasExtendedWaitCounts ());
32693269
3270- if (!ST.isWave64 () || !SIInstrInfo::isSALU (*MI))
3270+ if (!ST.isWave64 ())
3271+ return false ;
3272+
3273+ const bool IsSALU = SIInstrInfo::isSALU (*MI);
3274+ const bool IsVALU = SIInstrInfo::isVALU (*MI);
3275+ if (!IsSALU && !IsVALU)
32713276 return false ;
32723277
32733278 // The hazard sequence is three instructions:
32743279 // 1. VALU reads SGPR as mask
3275- // 2. SALU writes SGPR
3276- // 3. SALU reads SGPR
3277- // The hazard can expire if the distance between 2 and 3 is sufficient.
3278- // In practice this happens <10% of the time, hence this always assumes
3279- // the hazard exists if 1 and 2 are present to avoid searching.
3280+ // 2. VALU/SALU writes SGPR
3281+ // 3. VALU/SALU reads SGPR
3282+ // The hazard can expire if the distance between 2 and 3 is sufficient,
3283+ // or (2) is VALU and (3) is SALU.
3284+ // In practice this happens <10% of the time, hence always assume the hazard
3285+ // exists if (1) and (2) are present to avoid searching all SGPR reads.
32803286
3281- const MachineOperand *SDSTOp = TII.getNamedOperand (*MI, AMDGPU::OpName::sdst);
3282- if (!SDSTOp || !SDSTOp->isReg ())
3283- return false ;
3287+ const SIRegisterInfo *TRI = ST.getRegisterInfo ();
3288+ const MachineRegisterInfo &MRI = MF.getRegInfo ();
3289+
3290+ auto IgnoreableSGPR = [](const Register Reg) {
3291+ switch (Reg) {
3292+ case AMDGPU::EXEC:
3293+ case AMDGPU::EXEC_LO:
3294+ case AMDGPU::EXEC_HI:
3295+ case AMDGPU::M0:
3296+ case AMDGPU::SGPR_NULL:
3297+ case AMDGPU::SGPR_NULL64:
3298+ case AMDGPU::SCC:
3299+ return true ;
3300+ default :
3301+ return false ;
3302+ }
3303+ };
3304+ auto IsVCC = [](const Register Reg) {
3305+ return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3306+ };
3307+
3308+ struct StateType {
3309+ SmallSet<Register, 2 > HazardSGPRs;
3310+
3311+ static unsigned getHashValue (const StateType &State) {
3312+ return hash_combine_range (State.HazardSGPRs );
3313+ }
3314+ static bool isEqual (const StateType &LHS, const StateType &RHS) {
3315+ return LHS.HazardSGPRs == RHS.HazardSGPRs ;
3316+ }
3317+ };
3318+
3319+ SmallVector<const MachineInstr *> WaitInstrs;
3320+ bool HasSGPRRead = false ;
3321+ StateType InitialState;
3322+
3323+ // Look for SGPR write.
3324+ MachineOperand *HazardDef = nullptr ;
3325+ for (MachineOperand &Op : MI->operands ()) {
3326+ if (!Op.isReg ())
3327+ continue ;
3328+ if (Op.isDef () && HazardDef)
3329+ continue ;
3330+
3331+ Register Reg = Op.getReg ();
3332+ if (IgnoreableSGPR (Reg))
3333+ continue ;
3334+ if (!IsVCC (Reg)) {
3335+ if (Op.isImplicit ())
3336+ continue ;
3337+ if (!TRI->isSGPRReg (MRI, Reg))
3338+ continue ;
3339+ }
3340+ // Also check for SGPR reads.
3341+ if (Op.isUse ()) {
3342+ HasSGPRRead = true ;
3343+ continue ;
3344+ }
3345+
3346+ assert (!HazardDef);
3347+ HazardDef = &Op;
3348+ }
32843349
3285- const Register HazardReg = SDSTOp->getReg ();
3286- if (HazardReg == AMDGPU::EXEC ||
3287- HazardReg == AMDGPU::EXEC_LO ||
3288- HazardReg == AMDGPU::EXEC_HI ||
3289- HazardReg == AMDGPU::M0)
3350+ if (!HazardDef)
32903351 return false ;
32913352
3292- auto IsHazardFn = [HazardReg, this ](const MachineInstr &I) {
3353+ // Setup to track writes to individual SGPRs
3354+ const Register HazardReg = HazardDef->getReg ();
3355+ if (AMDGPU::SReg_32RegClass.contains (HazardReg)) {
3356+ InitialState.HazardSGPRs .insert (HazardReg);
3357+ } else {
3358+ assert (AMDGPU::SReg_64RegClass.contains (HazardReg));
3359+ InitialState.HazardSGPRs .insert (TRI->getSubReg (HazardReg, AMDGPU::sub0));
3360+ InitialState.HazardSGPRs .insert (TRI->getSubReg (HazardReg, AMDGPU::sub1));
3361+ }
3362+
3363+ auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
3364+ if (State.HazardSGPRs .empty ())
3365+ return HazardExpired;
3366+
32933367 switch (I.getOpcode ()) {
32943368 case AMDGPU::V_ADDC_U32_e32:
32953369 case AMDGPU::V_ADDC_U32_dpp:
@@ -3304,11 +3378,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
33043378 case AMDGPU::V_SUBB_U32_e32:
33053379 case AMDGPU::V_SUBB_U32_dpp:
33063380 case AMDGPU::V_SUBBREV_U32_e32:
3307- case AMDGPU::V_SUBBREV_U32_dpp:
3381+ case AMDGPU::V_SUBBREV_U32_dpp: {
33083382 // These implicitly read VCC as mask source.
3309- return HazardReg == AMDGPU::VCC ||
3310- HazardReg == AMDGPU::VCC_LO ||
3311- HazardReg == AMDGPU::VCC_HI;
3383+ return IsVCC (HazardReg) ? HazardFound : NoHazardFound;
3384+ }
33123385 case AMDGPU::V_ADDC_U32_e64:
33133386 case AMDGPU::V_ADDC_U32_e64_dpp:
33143387 case AMDGPU::V_CNDMASK_B16_t16_e64:
@@ -3324,68 +3397,109 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
33243397 // Only check mask register overlaps.
33253398 const MachineOperand *SSRCOp = TII.getNamedOperand (I, AMDGPU::OpName::src2);
33263399 assert (SSRCOp);
3327- return TRI.regsOverlap (SSRCOp->getReg (), HazardReg);
3400+ bool Result = TRI->regsOverlap (SSRCOp->getReg (), HazardReg);
3401+ return Result ? HazardFound : NoHazardFound;
33283402 }
33293403 default :
3330- return false ;
3404+ return NoHazardFound ;
33313405 }
33323406 };
33333407
3334- const MachineRegisterInfo &MRI = MF.getRegInfo ();
3335- auto IsExpiredFn = [&MRI, this ](const MachineInstr &I, int ) {
3336- // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
3337- if (I.getOpcode () == AMDGPU::S_WAITCNT_DEPCTR &&
3338- AMDGPU::DepCtr::decodeFieldSaSdst (I.getOperand (0 ).getImm ()) == 0 )
3339- return true ;
3340-
3341- // VALU access to any SGPR or literal constant other than HazardReg
3342- // mitigates hazard. No need to check HazardReg here as this will
3343- // only be called when !IsHazardFn.
3344- if (!SIInstrInfo::isVALU (I))
3345- return false ;
3346- for (int OpNo = 0 , End = I.getNumOperands (); OpNo < End; ++OpNo) {
3347- const MachineOperand &Op = I.getOperand (OpNo);
3348- if (Op.isReg ()) {
3349- Register OpReg = Op.getReg ();
3350- // Only consider uses
3351- if (!Op.isUse ())
3408+ const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst (
3409+ AMDGPU::DepCtr::encodeFieldVaSdst (AMDGPU::DepCtr::encodeFieldVaVcc (0 ), 0 ),
3410+ 0 );
3411+ auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
3412+ switch (I.getOpcode ()) {
3413+ case AMDGPU::S_WAITCNT_DEPCTR:
3414+ // Record mergable waits within region of instructions free of SGPR reads.
3415+ if (!HasSGPRRead && I.getParent () == MI->getParent () && !I.isBundled () &&
3416+ (I.getOperand (0 ).getImm () & ConstantMaskBits) == ConstantMaskBits)
3417+ WaitInstrs.push_back (&I);
3418+ break ;
3419+ default :
3420+ // Update tracking of SGPR reads and writes.
3421+ for (auto &Op : I.operands ()) {
3422+ if (!Op.isReg ())
33523423 continue ;
3353- // Ignore EXEC
3354- if (OpReg == AMDGPU::EXEC ||
3355- OpReg == AMDGPU::EXEC_LO ||
3356- OpReg == AMDGPU::EXEC_HI)
3424+
3425+ Register Reg = Op.getReg ();
3426+ if (IgnoreableSGPR (Reg))
33573427 continue ;
3358- // Ignore all implicit uses except VCC
3359- if (Op.isImplicit ()) {
3360- if (OpReg == AMDGPU::VCC ||
3361- OpReg == AMDGPU::VCC_LO ||
3362- OpReg == AMDGPU::VCC_HI)
3363- return true ;
3428+ if (!IsVCC (Reg)) {
3429+ if (Op.isImplicit ())
3430+ continue ;
3431+ if (!TRI->isSGPRReg (MRI, Reg))
3432+ continue ;
3433+ }
3434+ if (Op.isUse ()) {
3435+ HasSGPRRead = true ;
33643436 continue ;
33653437 }
3366- if (TRI.isSGPRReg (MRI, OpReg))
3367- return true ;
3368- } else {
3369- const MCInstrDesc &InstDesc = I.getDesc ();
3370- const MCOperandInfo &OpInfo = InstDesc.operands ()[OpNo];
3371- if (!TII.isInlineConstant (Op, OpInfo))
3372- return true ;
3438+
3439+ // Stop tracking any SGPRs with writes on the basis that they will
3440+ // already have an appropriate wait inserted afterwards.
3441+ SmallVector<Register, 2 > Found;
3442+ for (Register SGPR : State.HazardSGPRs ) {
3443+ if (Reg == SGPR || TRI->regsOverlap (Reg, SGPR))
3444+ Found.push_back (SGPR);
3445+ }
3446+ for (Register SGPR : Found)
3447+ State.HazardSGPRs .erase (SGPR);
33733448 }
3449+ break ;
33743450 }
3375- return false ;
33763451 };
33773452
33783453 // Check for hazard
3379- if (::getWaitStatesSince (IsHazardFn, MI, IsExpiredFn) ==
3380- std::numeric_limits<int >::max ())
3454+ if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
3455+ MI->getParent (),
3456+ std::next (MI->getReverseIterator ())))
33813457 return false ;
33823458
3383- auto NextMI = std::next (MI->getIterator ());
3459+ // Compute counter mask
3460+ unsigned DepCtr =
3461+ IsVALU ? (IsVCC (HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc (0 )
3462+ : AMDGPU::DepCtr::encodeFieldVaSdst (0 ))
3463+ : AMDGPU::DepCtr::encodeFieldSaSdst (0 );
3464+
3465+ // Try to merge previous waits into this one for regions with no SGPR reads.
3466+ if (!WaitInstrs.empty ()) {
3467+ // Note: WaitInstrs contains const pointers, so walk backward from MI to
3468+ // obtain a mutable pointer to each instruction to be merged.
3469+ // This is expected to be a very short walk within the same block.
3470+ SmallVector<MachineInstr *> ToErase;
3471+ unsigned Found = 0 ;
3472+ for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator (),
3473+ End = MI->getParent ()->rend ();
3474+ Found < WaitInstrs.size () && It != End; ++It) {
3475+ MachineInstr *WaitMI = &*It;
3476+ // Find next wait instruction.
3477+ if (std::as_const (WaitMI) != WaitInstrs[Found])
3478+ continue ;
3479+ Found++;
3480+ unsigned WaitMask = WaitMI->getOperand (0 ).getImm ();
3481+ assert ((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3482+ DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst (
3483+ DepCtr, std::min (AMDGPU::DepCtr::decodeFieldSaSdst (WaitMask),
3484+ AMDGPU::DepCtr::decodeFieldSaSdst (DepCtr)));
3485+ DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst (
3486+ DepCtr, std::min (AMDGPU::DepCtr::decodeFieldVaSdst (WaitMask),
3487+ AMDGPU::DepCtr::decodeFieldVaSdst (DepCtr)));
3488+ DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc (
3489+ DepCtr, std::min (AMDGPU::DepCtr::decodeFieldVaVcc (WaitMask),
3490+ AMDGPU::DepCtr::decodeFieldVaVcc (DepCtr)));
3491+ ToErase.push_back (WaitMI);
3492+ }
3493+ assert (Found == WaitInstrs.size ());
3494+ for (MachineInstr *WaitMI : ToErase)
3495+ WaitMI->eraseFromParent ();
3496+ }
33843497
3385- // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
3498+ // Add s_waitcnt_depctr after SGPR write.
3499+ auto NextMI = std::next (MI->getIterator ());
33863500 auto NewMI = BuildMI (*MI->getParent (), NextMI, MI->getDebugLoc (),
33873501 TII.get (AMDGPU::S_WAITCNT_DEPCTR))
3388- .addImm (AMDGPU:: DepCtr::encodeFieldSaSdst ( 0 ) );
3502+ .addImm (DepCtr);
33893503
33903504 // SALU write may be s_getpc in a bundle.
33913505 updateGetPCBundle (NewMI);
0 commit comments