@@ -1647,17 +1647,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
16471647 (MI.isReturn () && MI.isCall () && !callWaitsOnFunctionEntry (MI))) {
16481648 Wait = Wait.combined (WCG->getAllZeroWaitcnt (/* IncludeVSCnt=*/ false ));
16491649 }
1650- // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1651- // stores. In this case it can be useful to send a message to explicitly
1652- // release all VGPRs before the stores have completed, but it is only safe to
1653- // do this if:
1654- // * there are no outstanding scratch stores
1655- // * we are not in Dynamic VGPR mode
1650+ // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1651+ // Technically the hardware will do this on its own if we don't, but that
1652+ // might cost extra cycles compared to doing it explicitly.
1653+ // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1654+ // have to wait for outstanding VMEM stores. In this case it can be useful to
1655+ // send a message to explicitly release all VGPRs before the stores have
1656+ // completed, but it is only safe to do this if there are no outstanding
1657+ // scratch stores.
16561658 else if (MI.getOpcode () == AMDGPU::S_ENDPGM ||
16571659 MI.getOpcode () == AMDGPU::S_ENDPGM_SAVED) {
1658- if (ST->getGeneration () >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone () &&
1659- ScoreBrackets.getScoreRange (STORE_CNT) != 0 &&
1660- !ScoreBrackets.hasPendingEvent (SCRATCH_WRITE_ACCESS))
1660+ if (!WCG->isOptNone () &&
1661+ (ST->isDynamicVGPREnabled () ||
1662+ (ST->getGeneration () >= AMDGPUSubtarget::GFX11 &&
1663+ ScoreBrackets.getScoreRange (STORE_CNT) != 0 &&
1664+ !ScoreBrackets.hasPendingEvent (SCRATCH_WRITE_ACCESS))))
16611665 ReleaseVGPRInsts.insert (&MI);
16621666 }
16631667 // Resolve vm waits before gs-done.
@@ -2611,26 +2615,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
26112615 }
26122616 }
26132617
2614- // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2615- // instructions.
2618+ // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2619+ // This is done in different ways depending on how the VGPRs were allocated
2620+ // (i.e. whether we're in dynamic VGPR mode or not).
26162621 // When not in dynamic VGPR mode, skip deallocation if the kernel is waveslot
26172622 // limited vs VGPR limited: short waveslot limited kernels run slower with it.
2618- if (!ReleaseVGPRInsts.empty () &&
2619- (MF.getFrameInfo ().hasCalls () ||
2620- ST->getOccupancyWithNumVGPRs (
2621- TRI->getNumUsedPhysRegs (*MRI, AMDGPU::VGPR_32RegClass)) <
2622- AMDGPU::IsaInfo::getMaxWavesPerEU (ST))) {
2623+ if (ST->isDynamicVGPREnabled ()) {
26232624 for (MachineInstr *MI : ReleaseVGPRInsts) {
2624- if (ST->requiresNopBeforeDeallocVGPRs ()) {
2625- BuildMI (*MI->getParent (), MI, MI->getDebugLoc (),
2626- TII->get (AMDGPU::S_NOP))
2627- .addImm (0 );
2628- }
26292625 BuildMI (*MI->getParent (), MI, MI->getDebugLoc (),
2630- TII->get (AMDGPU::S_SENDMSG ))
2631- .addImm (AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus );
2626+ TII->get (AMDGPU::S_ALLOC_VGPR ))
2627+ .addImm (0 );
26322628 Modified = true ;
26332629 }
2630+ } else {
2631+ if (!ReleaseVGPRInsts.empty () &&
2632+ (MF.getFrameInfo ().hasCalls () ||
2633+ ST->getOccupancyWithNumVGPRs (
2634+ TRI->getNumUsedPhysRegs (*MRI, AMDGPU::VGPR_32RegClass)) <
2635+ AMDGPU::IsaInfo::getMaxWavesPerEU (ST))) {
2636+ for (MachineInstr *MI : ReleaseVGPRInsts) {
2637+ if (ST->requiresNopBeforeDeallocVGPRs ()) {
2638+ BuildMI (*MI->getParent (), MI, MI->getDebugLoc (),
2639+ TII->get (AMDGPU::S_NOP))
2640+ .addImm (0 );
2641+ }
2642+ BuildMI (*MI->getParent (), MI, MI->getDebugLoc (),
2643+ TII->get (AMDGPU::S_SENDMSG))
2644+ .addImm (AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2645+ Modified = true ;
2646+ }
2647+ }
26342648 }
26352649 ReleaseVGPRInsts.clear ();
26362650 PreheadersToFlush.clear ();
0 commit comments