@@ -1647,17 +1647,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
16471647 (MI.isReturn () && MI.isCall () && !callWaitsOnFunctionEntry (MI))) {
16481648 Wait = Wait.combined (WCG->getAllZeroWaitcnt (/* IncludeVSCnt=*/ false ));
16491649 }
1650- // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1651- // stores. In this case it can be useful to send a message to explicitly
1652- // release all VGPRs before the stores have completed, but it is only safe to
1653- // do this if:
1654- // * there are no outstanding scratch stores
1655- // * we are not in Dynamic VGPR mode
1650+ // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1651+ // Technically the hardware will do this on its own if we don't, but that
1652+ // might cost extra cycles compared to doing it explicitly.
1653+ // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1654+ // have to wait for outstanding VMEM stores. In this case it can be useful to
1655+ // send a message to explicitly release all VGPRs before the stores have
1656+ // completed, but it is only safe to do this if there are no outstanding
1657+ // scratch stores.
16561658 else if (MI.getOpcode () == AMDGPU::S_ENDPGM ||
16571659 MI.getOpcode () == AMDGPU::S_ENDPGM_SAVED) {
1658- if (ST->getGeneration () >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone () &&
1659- ScoreBrackets.getScoreRange (STORE_CNT) != 0 &&
1660- !ScoreBrackets.hasPendingEvent (SCRATCH_WRITE_ACCESS))
1660+ if (!WCG->isOptNone () &&
1661+ (ST->isDynamicVGPREnabled () ||
1662+ (ST->getGeneration () >= AMDGPUSubtarget::GFX11 &&
1663+ ScoreBrackets.getScoreRange (STORE_CNT) != 0 &&
1664+ !ScoreBrackets.hasPendingEvent (SCRATCH_WRITE_ACCESS))))
16611665 ReleaseVGPRInsts.insert (&MI);
16621666 }
16631667 // Resolve vm waits before gs-done.
@@ -2610,26 +2614,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
26102614 }
26112615 }
26122616
2613- // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2614- // instructions.
2617+ // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2618+ // This is done in different ways depending on how the VGPRs were allocated
2619+ // (i.e. whether we're in dynamic VGPR mode or not).
26152620 // When not in dynamic VGPR mode, skip deallocation if the kernel is waveslot limited
26162621 // rather than VGPR limited. A short waveslot limited kernel runs slower with the deallocation.
2617- if (!ReleaseVGPRInsts.empty () &&
2618- (MF.getFrameInfo ().hasCalls () ||
2619- ST->getOccupancyWithNumVGPRs (
2620- TRI->getNumUsedPhysRegs (*MRI, AMDGPU::VGPR_32RegClass)) <
2621- AMDGPU::IsaInfo::getMaxWavesPerEU (ST))) {
2622+ if (ST->isDynamicVGPREnabled ()) {
26222623 for (MachineInstr *MI : ReleaseVGPRInsts) {
2623- if (ST->requiresNopBeforeDeallocVGPRs ()) {
2624- BuildMI (*MI->getParent (), MI, MI->getDebugLoc (),
2625- TII->get (AMDGPU::S_NOP))
2626- .addImm (0 );
2627- }
26282624 BuildMI (*MI->getParent (), MI, MI->getDebugLoc (),
2629- TII->get (AMDGPU::S_SENDMSG ))
2630- .addImm (AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus );
2625+ TII->get (AMDGPU::S_ALLOC_VGPR ))
2626+ .addImm (0 );
26312627 Modified = true ;
26322628 }
2629+ } else {
2630+ if (!ReleaseVGPRInsts.empty () &&
2631+ (MF.getFrameInfo ().hasCalls () ||
2632+ ST->getOccupancyWithNumVGPRs (
2633+ TRI->getNumUsedPhysRegs (*MRI, AMDGPU::VGPR_32RegClass)) <
2634+ AMDGPU::IsaInfo::getMaxWavesPerEU (ST))) {
2635+ for (MachineInstr *MI : ReleaseVGPRInsts) {
2636+ if (ST->requiresNopBeforeDeallocVGPRs ()) {
2637+ BuildMI (*MI->getParent (), MI, MI->getDebugLoc (),
2638+ TII->get (AMDGPU::S_NOP))
2639+ .addImm (0 );
2640+ }
2641+ BuildMI (*MI->getParent (), MI, MI->getDebugLoc (),
2642+ TII->get (AMDGPU::S_SENDMSG))
2643+ .addImm (AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2644+ Modified = true ;
2645+ }
2646+ }
26332647 }
26342648 ReleaseVGPRInsts.clear ();
26352649 PreheadersToFlush.clear ();
0 commit comments