@@ -1204,6 +1204,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
     fixGetRegWaitIdle(MI);
   if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
     fixDsAtomicAsyncBarrierArriveB64(MI);
+  if (ST.hasScratchBaseForwardingHazard())
+    fixScratchBaseForwardingHazard(MI);
 }

 static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
@@ -3468,3 +3470,88 @@ bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
   return true;
 }
+
+bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
+  // No reason to check this in pre-RA scheduling; SGPRs have to be allocated
+  // for the hazard to trigger.
+  if (!IsHazardRecognizerMode)
+    return false;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU.
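+  // The backwards scan below counts SALU and VALU SGPR writes together
+  // against the larger (SALU) bound.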
+  const int FlatScrBaseWaitStates = 10;
+
+  bool ReadsFlatScrLo =
+      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
+  bool ReadsFlatScrHi =
+      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
+  if (isSGetReg(MI->getOpcode())) {
+    switch (getHWReg(TII, *MI)) {
+    default:
+      break;
+    case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
+      ReadsFlatScrLo = true;
+      break;
+    case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
+      ReadsFlatScrHi = true;
+      break;
+    }
+  }
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  auto IsRegDefHazard = [&](Register Reg) -> bool {
+    DenseSet<const MachineBasicBlock *> Visited;
+    auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) {
+      return MI.modifiesRegister(Reg, TRI);
+    };
+
+    // This literally abuses the idea of waitstates: instead of waitstates it
+    // returns 1 for an SGPR write and 0 otherwise.
+    auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
+      if (!TII->isSALU(MI) && !TII->isVALU(MI))
+        return 0;
+      for (const MachineOperand &MO : MI.all_defs()) {
+        if (TRI->isSGPRReg(MRI, MO.getReg()))
+          return 1;
+      }
+      return 0;
+    };
+
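+    // An s_waitcnt_depctr that already waits on both sa_sdst and va_sdst
+    // resolves the hazard, so the search can stop there.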
+    auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
+      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
+        unsigned Wait = MI.getOperand(0).getImm();
+        if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 &&
+            AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0)
+          return true;
+      }
+      return SgprWrites >= FlatScrBaseWaitStates;
+    };
+
+    return ::getWaitStatesSince(
+               IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
+               0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
+  };
+
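+  // No workaround is needed if neither half of the base is read, if the
+  // corresponding SGPR (s102 for LO, s103 for HI) never changes within the
+  // function, or if there is no hazardous def in range.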
+  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
+       !IsRegDefHazard(AMDGPU::SGPR102)) &&
+      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
+       !IsRegDefHazard(AMDGPU::SGPR103)))
+    return false;
+
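+  // Mitigate by waiting for all outstanding SALU (sa_sdst = 0) and VALU
+  // (va_sdst = 0) writes to SGPRs before the scratch base read.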
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+      .addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
+          AMDGPU::DepCtr::encodeFieldSaSdst(0), 0));
+  return true;
+}