Skip to content

Commit c29d820

Browse files
committed
[AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode
In dynamic VGPR mode, waves must deallocate all VGPRs before exiting. If the shader program does not do this, the hardware inserts `S_ALLOC_VGPR 0` before `S_ENDPGM`, but this may incur some performance cost. Therefore it's better if the compiler proactively generates that instruction. This patch extends `si-insert-waitcnts` to deallocate the VGPRs via an `S_ALLOC_VGPR 0` before any `S_ENDPGM` when in dynamic VGPR mode.
1 parent b2a7bdc commit c29d820

File tree

2 files changed

+393
-23
lines changed

2 files changed

+393
-23
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1647,17 +1647,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
16471647
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
16481648
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
16491649
}
1650-
// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1651-
// stores. In this case it can be useful to send a message to explicitly
1652-
// release all VGPRs before the stores have completed, but it is only safe to
1653-
// do this if:
1654-
// * there are no outstanding scratch stores
1655-
// * we are not in Dynamic VGPR mode
1650+
// In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1651+
// Technically the hardware will do this on its own if we don't, but that
1652+
// might cost extra cycles compared to doing it explicitly.
1653+
// When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1654+
// have to wait for outstanding VMEM stores. In this case it can be useful to
1655+
// send a message to explicitly release all VGPRs before the stores have
1656+
// completed, but it is only safe to do this if there are no outstanding
1657+
// scratch stores.
16561658
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
16571659
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1658-
if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
1659-
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1660-
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1660+
if (!WCG->isOptNone() &&
1661+
(ST->isDynamicVGPREnabled() ||
1662+
(ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1663+
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1664+
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
16611665
ReleaseVGPRInsts.insert(&MI);
16621666
}
16631667
// Resolve vm waits before gs-done.
@@ -2610,26 +2614,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
26102614
}
26112615
}
26122616

2613-
// Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2614-
// instructions.
2617+
// Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2618+
// This is done in different ways depending on how the VGPRs were allocated
2619+
// (i.e. whether we're in dynamic VGPR mode or not).
26152620
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
26162621
// waveslot limited kernel runs slower with the deallocation.
2617-
if (!ReleaseVGPRInsts.empty() &&
2618-
(MF.getFrameInfo().hasCalls() ||
2619-
ST->getOccupancyWithNumVGPRs(
2620-
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
2621-
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
2622+
if (ST->isDynamicVGPREnabled()) {
26222623
for (MachineInstr *MI : ReleaseVGPRInsts) {
2623-
if (ST->requiresNopBeforeDeallocVGPRs()) {
2624-
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2625-
TII->get(AMDGPU::S_NOP))
2626-
.addImm(0);
2627-
}
26282624
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2629-
TII->get(AMDGPU::S_SENDMSG))
2630-
.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2625+
TII->get(AMDGPU::S_ALLOC_VGPR))
2626+
.addImm(0);
26312627
Modified = true;
26322628
}
2629+
} else {
2630+
if (!ReleaseVGPRInsts.empty() &&
2631+
(MF.getFrameInfo().hasCalls() ||
2632+
ST->getOccupancyWithNumVGPRs(
2633+
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
2634+
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
2635+
for (MachineInstr *MI : ReleaseVGPRInsts) {
2636+
if (ST->requiresNopBeforeDeallocVGPRs()) {
2637+
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2638+
TII->get(AMDGPU::S_NOP))
2639+
.addImm(0);
2640+
}
2641+
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2642+
TII->get(AMDGPU::S_SENDMSG))
2643+
.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2644+
Modified = true;
2645+
}
2646+
}
26332647
}
26342648
ReleaseVGPRInsts.clear();
26352649
PreheadersToFlush.clear();

0 commit comments

Comments
 (0)