@@ -844,17 +844,62 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
844844 }
845845 assert (ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
846846
847- if (hasFP (MF)) {
847+ unsigned Offset = FrameInfo.getStackSize () * getScratchScaleFactor (ST);
848+ if (!mayReserveScratchForCWSR (MF)) {
849+ if (hasFP (MF)) {
850+ Register FPReg = MFI->getFrameOffsetReg ();
851+ assert (FPReg != AMDGPU::FP_REG);
852+ BuildMI (MBB, I, DL, TII->get (AMDGPU::S_MOV_B32), FPReg).addImm (0 );
853+ }
854+
855+ if (requiresStackPointerReference (MF)) {
856+ Register SPReg = MFI->getStackPtrOffsetReg ();
857+ assert (SPReg != AMDGPU::SP_REG);
858+ BuildMI (MBB, I, DL, TII->get (AMDGPU::S_MOV_B32), SPReg).addImm (Offset);
859+ }
860+ } else {
861+ // We need to check if we're on a compute queue - if we are, then the CWSR
862+ // trap handler may need to store some VGPRs on the stack. The first VGPR
863+ // block is saved separately, so we only need to allocate space for any
864+ // additional VGPR blocks used. For now, we will make sure there's enough
865+ // room for the theoretical maximum number of VGPRs that can be allocated.
866+ // FIXME: Figure out if the shader uses fewer VGPRs in practice.
867+ assert (hasFP (MF));
848868 Register FPReg = MFI->getFrameOffsetReg ();
849869 assert (FPReg != AMDGPU::FP_REG);
850- BuildMI (MBB, I, DL, TII->get (AMDGPU::S_MOV_B32), FPReg).addImm (0 );
851- }
852-
853- if (requiresStackPointerReference (MF)) {
854- Register SPReg = MFI->getStackPtrOffsetReg ();
855- assert (SPReg != AMDGPU::SP_REG);
856- BuildMI (MBB, I, DL, TII->get (AMDGPU::S_MOV_B32), SPReg)
857- .addImm (FrameInfo.getStackSize () * getScratchScaleFactor (ST));
870+ unsigned VGPRSize =
871+ llvm::alignTo ((ST.getAddressableNumVGPRs () -
872+ AMDGPU::IsaInfo::getVGPRAllocGranule (&ST)) *
873+ 4 ,
874+ FrameInfo.getMaxAlign ());
875+ MFI->setScratchReservedForDynamicVGPRs (VGPRSize);
876+
877+ BuildMI (MBB, I, DL, TII->get (AMDGPU::S_GETREG_B32), FPReg)
878+ .addImm (AMDGPU::Hwreg::HwregEncoding::encode (
879+ AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2 ));
880+ // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
881+ // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
882+ // SCC, so we need to check for 0 manually.
883+ BuildMI (MBB, I, DL, TII->get (AMDGPU::S_CMP_LG_U32)).addImm (0 ).addReg (FPReg);
884+ BuildMI (MBB, I, DL, TII->get (AMDGPU::S_CMOVK_I32), FPReg).addImm (VGPRSize);
885+ if (requiresStackPointerReference (MF)) {
886+ Register SPReg = MFI->getStackPtrOffsetReg ();
887+ assert (SPReg != AMDGPU::SP_REG);
888+
889+ // If at least one of the constants can be inlined, then we can use
890+ // s_cselect. Otherwise, use a mov and cmovk.
891+ if (AMDGPU::isInlinableLiteral32 (Offset, ST.hasInv2PiInlineImm ()) ||
892+ AMDGPU::isInlinableLiteral32 (Offset + VGPRSize,
893+ ST.hasInv2PiInlineImm ())) {
894+ BuildMI (MBB, I, DL, TII->get (AMDGPU::S_CSELECT_B32), SPReg)
895+ .addImm (Offset + VGPRSize)
896+ .addImm (Offset);
897+ } else {
898+ BuildMI (MBB, I, DL, TII->get (AMDGPU::S_MOV_B32), SPReg).addImm (Offset);
899+ BuildMI (MBB, I, DL, TII->get (AMDGPU::S_CMOVK_I32), SPReg)
900+ .addImm (Offset + VGPRSize);
901+ }
902+ }
858903 }
859904
860905 bool NeedsFlatScratchInit =
@@ -2182,9 +2227,17 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
21822227 return frameTriviallyRequiresSP (MFI) || MFI.isFrameAddressTaken () ||
21832228 MF.getSubtarget <GCNSubtarget>().getRegisterInfo ()->hasStackRealignment (
21842229 MF) ||
2230+ mayReserveScratchForCWSR (MF) ||
21852231 MF.getTarget ().Options .DisableFramePointerElim (MF);
21862232}
21872233
2234+ bool SIFrameLowering::mayReserveScratchForCWSR (
2235+ const MachineFunction &MF) const {
2236+ return MF.getSubtarget <GCNSubtarget>().isDynamicVGPREnabled () &&
2237+ AMDGPU::isEntryFunctionCC (MF.getFunction ().getCallingConv ()) &&
2238+ AMDGPU::isCompute (MF.getFunction ().getCallingConv ());
2239+ }
2240+
21882241// This is essentially a reduced version of hasFP for entry functions. Since the
21892242// stack pointer is known 0 on entry to kernels, we never really need an FP
21902243// register. We may need to initialize the stack pointer depending on the frame
0 commit comments