@@ -3405,11 +3405,15 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
34053405 }
34063406 };
34073407
3408+ const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst (
3409+ AMDGPU::DepCtr::encodeFieldVaSdst (AMDGPU::DepCtr::encodeFieldVaVcc (0 ), 0 ),
3410+ 0 );
34083411 auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
34093412 switch (I.getOpcode ()) {
34103413 case AMDGPU::S_WAITCNT_DEPCTR:
3411- // Record waits within region of instructions free of SGPR reads.
3412- if (!HasSGPRRead && I.getParent () == MI->getParent ())
3414+ // Record mergeable waits within region of instructions free of SGPR reads.
3415+ if (!HasSGPRRead && I.getParent () == MI->getParent () && !I.isBundled () &&
3416+ (I.getOperand (0 ).getImm () & ConstantMaskBits) == ConstantMaskBits)
34133417 WaitInstrs.push_back (&I);
34143418 break ;
34153419 default :
@@ -3459,21 +3463,22 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
34593463 : AMDGPU::DepCtr::encodeFieldSaSdst (0 );
34603464
34613465 // Try to merge previous waits into this one for regions with no SGPR reads.
3462- if (WaitInstrs.size ()) {
3463- const unsigned ConstantBits = AMDGPU::DepCtr::encodeFieldSaSdst (
3464- AMDGPU::DepCtr::encodeFieldVaSdst (AMDGPU::DepCtr::encodeFieldVaVcc (0 ),
3465- 0 ),
3466- 0 );
3467-
3468- for (const MachineInstr *Instr : WaitInstrs) {
3469- // Don't touch bundled waits.
3470- if (Instr->isBundled ())
3466+ if (!WaitInstrs.empty ()) {
3467+ // Note: WaitInstrs contains const pointers, so walk backward from MI to
3468+ // obtain a mutable pointer to each instruction to be merged.
3469+ // This is expected to be a very short walk within the same block.
3470+ SmallVector<MachineInstr *> ToErase;
3471+ unsigned Found = 0 ;
3472+ for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator (),
3473+ End = MI->getParent ()->rend ();
3474+ Found < WaitInstrs.size () && It != End; ++It) {
3475+ MachineInstr *WaitMI = &*It;
3476+ // Find next wait instruction.
3477+ if (std::as_const (WaitMI) != WaitInstrs[Found])
34713478 continue ;
3472- MachineInstr *WaitMI = const_cast <MachineInstr *>(Instr) ;
3479+ Found++ ;
34733480 unsigned WaitMask = WaitMI->getOperand (0 ).getImm ();
3474- // Only work with counters related to this hazard.
3475- if ((WaitMask & ConstantBits) != ConstantBits)
3476- continue ;
3481+ assert ((WaitMask & ConstantMaskBits) == ConstantMaskBits);
34773482 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst (
34783483 DepCtr, std::min (AMDGPU::DepCtr::decodeFieldSaSdst (WaitMask),
34793484 AMDGPU::DepCtr::decodeFieldSaSdst (DepCtr)));
@@ -3483,8 +3488,11 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
34833488 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc (
34843489 DepCtr, std::min (AMDGPU::DepCtr::decodeFieldVaVcc (WaitMask),
34853490 AMDGPU::DepCtr::decodeFieldVaVcc (DepCtr)));
3486- WaitMI-> eraseFromParent ( );
3491+ ToErase. push_back (WaitMI );
34873492 }
3493+ assert (Found == WaitInstrs.size ());
3494+ for (MachineInstr *WaitMI : ToErase)
3495+ WaitMI->eraseFromParent ();
34883496 }
34893497
34903498 // Add s_waitcnt_depctr after SGPR write.
0 commit comments