@@ -1640,17 +1640,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
   }
-  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
-  // stores. In this case it can be useful to send a message to explicitly
-  // release all VGPRs before the stores have completed, but it is only safe to
-  // do this if:
-  // * there are no outstanding scratch stores
-  // * we are not in Dynamic VGPR mode
+  // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
+  // Technically the hardware will do this on its own if we don't, but that
+  // might cost extra cycles compared to doing it explicitly.
+  // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
+  // have to wait for outstanding VMEM stores. In this case it can be useful to
+  // send a message to explicitly release all VGPRs before the stores have
+  // completed, but it is only safe to do this if there are no outstanding
+  // scratch stores.
   else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
-    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
-        ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
-        !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
+    if (!WCG->isOptNone() &&
+        (ST->isDynamicVGPREnabled() ||
+         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+          ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
+          !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
       ReleaseVGPRInsts.insert(&MI);
   }
   // Resolve vm waits before gs-done.
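
To make the new candidate check easier to follow, here is a minimal, self-contained sketch of the predicate in plain C++. The function name and the boolean parameters are hypothetical stand-ins for the subtarget and score-bracket queries used in the hunk above (isOptNone, isDynamicVGPREnabled, getScoreRange, hasPendingEvent); this is not LLVM API.

// Minimal sketch, not LLVM API: hypothetical stand-ins for the queries above.
#include <cstdio>

static bool isReleaseVGPRCandidate(bool OptNone, bool DynamicVGPRMode,
                                   bool IsGFX11Plus, bool PendingVMEMStores,
                                   bool PendingScratchStores) {
  if (OptNone)
    return false; // Never emit the release at -O0 (WCG->isOptNone()).
  if (DynamicVGPRMode)
    return true; // Always release explicitly before the wave exits.
  // Pre-existing path: only worthwhile if VMEM stores are still outstanding,
  // and only safe if none of them are scratch stores.
  return IsGFX11Plus && PendingVMEMStores && !PendingScratchStores;
}

int main() {
  // Dynamic VGPR mode: candidate even with no outstanding stores.
  std::printf("%d\n", isReleaseVGPRCandidate(false, true, true, false, false));
  // Non-dynamic mode with a pending scratch store: not a candidate.
  std::printf("%d\n", isReleaseVGPRCandidate(false, false, true, true, true));
  return 0;
}
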
@@ -2593,26 +2597,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
     }
   }

-  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
-  // instructions.
+  // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
+  // This is done in different ways depending on how the VGPRs were allocated
+  // (i.e. whether we're in dynamic VGPR mode or not).
   // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
   // waveslot limited kernel runs slower with the deallocation.
-  if (!ReleaseVGPRInsts.empty() &&
-      (MF.getFrameInfo().hasCalls() ||
-       ST->getOccupancyWithNumVGPRs(
-           TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
-           AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+  if (ST->isDynamicVGPREnabled()) {
     for (MachineInstr *MI : ReleaseVGPRInsts) {
-      if (ST->requiresNopBeforeDeallocVGPRs()) {
-        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-                TII->get(AMDGPU::S_NOP))
-            .addImm(0);
-      }
       BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-              TII->get(AMDGPU::S_SENDMSG))
-          .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+              TII->get(AMDGPU::S_ALLOC_VGPR))
+          .addImm(0);
       Modified = true;
     }
+  } else {
+    if (!ReleaseVGPRInsts.empty() &&
+        (MF.getFrameInfo().hasCalls() ||
+         ST->getOccupancyWithNumVGPRs(
+             TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
+             AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+      for (MachineInstr *MI : ReleaseVGPRInsts) {
+        if (ST->requiresNopBeforeDeallocVGPRs()) {
+          BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                  TII->get(AMDGPU::S_NOP))
+              .addImm(0);
+        }
+        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                TII->get(AMDGPU::S_SENDMSG))
+            .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+        Modified = true;
+      }
+    }
   }
   ReleaseVGPRInsts.clear();
   PreheadersToFlush.clear();
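
For reference, a rough sketch of what the two paths now emit before each identified S_ENDPGM, assuming the waveslot/occupancy check has already passed for the non-dynamic path. The instruction names are spelled with the opcode identifiers used in the hunk above; the helper function itself is hypothetical and not LLVM API.

// Minimal sketch, not LLVM API: returns the instruction sequence inserted
// before S_ENDPGM, using the opcode names from the pass above.
#include <iostream>
#include <string>
#include <vector>

static std::vector<std::string> releaseSequence(bool DynamicVGPRMode,
                                                bool NeedsNopBeforeDealloc) {
  if (DynamicVGPRMode) {
    // Dynamic VGPR mode: shrink the allocation explicitly rather than relying
    // on the hardware releasing it when the wave exits.
    return {"S_ALLOC_VGPR 0"};
  }
  std::vector<std::string> Seq;
  if (NeedsNopBeforeDealloc)
    Seq.push_back("S_NOP 0"); // requiresNopBeforeDeallocVGPRs()
  Seq.push_back("S_SENDMSG ID_DEALLOC_VGPRS_GFX11Plus");
  return Seq;
}

int main() {
  for (const std::string &I : releaseSequence(/*DynamicVGPRMode=*/true, false))
    std::cout << I << '\n';
  for (const std::string &I : releaseSequence(/*DynamicVGPRMode=*/false, true))
    std::cout << I << '\n';
  return 0;
}
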