@@ -1847,6 +1847,110 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
18471847 }
18481848}
18491849
1850+ static void assignSlotsUsingVGPRBlocks (MachineFunction &MF,
1851+ const GCNSubtarget &ST,
1852+ const TargetRegisterInfo *TRI,
1853+ std::vector<CalleeSavedInfo> &CSI,
1854+ unsigned &MinCSFrameIndex,
1855+ unsigned &MaxCSFrameIndex) {
1856+ SIMachineFunctionInfo *FuncInfo = MF.getInfo <SIMachineFunctionInfo>();
1857+ MachineFrameInfo &MFI = MF.getFrameInfo ();
1858+ const SIInstrInfo *TII = ST.getInstrInfo ();
1859+ const SIRegisterInfo *MRI = ST.getRegisterInfo ();
1860+
1861+ assert (std::is_sorted (CSI.begin (), CSI.end (),
1862+ [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
1863+ return A.getReg () < B.getReg ();
1864+ }) &&
1865+ " Callee saved registers not sorted" );
1866+
1867+ auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
1868+ return !CSI.isSpilledToReg () &&
1869+ MRI->isVGPR (MF.getRegInfo (), CSI.getReg ()) &&
1870+ !FuncInfo->isWWMReservedRegister (CSI.getReg ());
1871+ };
1872+
1873+ auto CSEnd = CSI.end ();
1874+ for (auto CSIt = CSI.begin (); CSIt != CSEnd; ++CSIt) {
1875+ Register Reg = CSIt->getReg ();
1876+ if (!CanUseBlockOps (*CSIt))
1877+ continue ;
1878+
1879+ // Find all the regs that will fit in a 32-bit block starting at the current
1880+ // reg and build the mask. It should have 1 for every register that's
1881+ // included, with the current register as the least significant bit.
1882+ uint32_t Mask = 1 ;
1883+ CSEnd = std::remove_if (
1884+ CSIt + 1 , CSEnd, [&](const CalleeSavedInfo &CSI) -> bool {
1885+ if (CanUseBlockOps (CSI) && CSI.getReg () < Reg + 32 ) {
1886+ Mask |= 1 << (CSI.getReg () - Reg);
1887+ return true ;
1888+ } else {
1889+ return false ;
1890+ }
1891+ });
1892+
1893+ const TargetRegisterClass *BlockRegClass =
1894+ TII->getRegClassForBlockOp (TRI, MF);
1895+ Register RegBlock =
1896+ MRI->getMatchingSuperReg (Reg, AMDGPU::sub0, BlockRegClass);
1897+ if (!RegBlock) {
1898+ // We couldn't find a super register for the block. This can happen if
1899+ // the register we started with is too high (e.g. v232 if the maximum is
1900+ // v255). We therefore try to get the last register block and figure out
1901+ // the mask from there.
1902+ Register LastBlockStart =
1903+ AMDGPU::VGPR0 + alignDown (Reg - AMDGPU::VGPR0, 32 );
1904+ RegBlock =
1905+ MRI->getMatchingSuperReg (LastBlockStart, AMDGPU::sub0, BlockRegClass);
1906+ assert (RegBlock && MRI->isSubRegister (RegBlock, Reg) &&
1907+ " Couldn't find super register" );
1908+ int RegDelta = Reg - LastBlockStart;
1909+ assert (RegDelta > 0 && llvm::countl_zero (Mask) >= RegDelta &&
1910+ " Bad shift amount" );
1911+ Mask <<= RegDelta;
1912+ }
1913+
1914+ FuncInfo->setMaskForVGPRBlockOps (RegBlock, Mask);
1915+
1916+ // The stack objects can be a bit smaller than the register block if we know
1917+ // some of the high bits of Mask are 0. This may happen often with calling
1918+ // conventions where the caller and callee-saved VGPRs are interleaved at
1919+ // a small boundary (e.g. 8 or 16).
1920+ int UnusedBits = llvm::countl_zero (Mask);
1921+ unsigned BlockSize = MRI->getSpillSize (*BlockRegClass) - UnusedBits * 4 ;
1922+ int FrameIdx =
1923+ MFI.CreateStackObject (BlockSize, MRI->getSpillAlign (*BlockRegClass),
1924+ /* isSpillSlot=*/ true );
1925+ if ((unsigned )FrameIdx < MinCSFrameIndex)
1926+ MinCSFrameIndex = FrameIdx;
1927+ if ((unsigned )FrameIdx > MaxCSFrameIndex)
1928+ MaxCSFrameIndex = FrameIdx;
1929+
1930+ CSIt->setFrameIdx (FrameIdx);
1931+ CSIt->setReg (RegBlock);
1932+ CSIt->setHandledByTarget ();
1933+ }
1934+ CSI.erase (CSEnd, CSI.end ());
1935+ }
1936+
1937+ bool SIFrameLowering::assignCalleeSavedSpillSlots (
1938+ MachineFunction &MF, const TargetRegisterInfo *TRI,
1939+ std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
1940+ unsigned &MaxCSFrameIndex) const {
1941+ if (CSI.empty ())
1942+ return true ; // Early exit if no callee saved registers are modified!
1943+
1944+ const GCNSubtarget &ST = MF.getSubtarget <GCNSubtarget>();
1945+ bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR ();
1946+
1947+ if (UseVGPRBlocks)
1948+ assignSlotsUsingVGPRBlocks (MF, ST, TRI, CSI, MinCSFrameIndex,
1949+ MaxCSFrameIndex);
1950+
1951+ return assignCalleeSavedSpillSlots (MF, TRI, CSI);
1952+ }
1953+
18501954bool SIFrameLowering::assignCalleeSavedSpillSlots (
18511955 MachineFunction &MF, const TargetRegisterInfo *TRI,
18521956 std::vector<CalleeSavedInfo> &CSI) const {
@@ -1915,6 +2019,101 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
19152019 return true ;
19162020}
19172021
2022+ bool SIFrameLowering::spillCalleeSavedRegisters (
2023+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2024+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2025+ MachineFunction *MF = MBB.getParent ();
2026+ const GCNSubtarget &ST = MF->getSubtarget <GCNSubtarget>();
2027+ if (!ST.useVGPRBlockOpsForCSR ())
2028+ return false ;
2029+
2030+ MachineFrameInfo &FrameInfo = MF->getFrameInfo ();
2031+ SIMachineFunctionInfo *MFI = MF->getInfo <SIMachineFunctionInfo>();
2032+ const SIInstrInfo *TII = ST.getInstrInfo ();
2033+ SIMachineFunctionInfo *FuncInfo = MF->getInfo <SIMachineFunctionInfo>();
2034+
2035+ for (const CalleeSavedInfo &CS : CSI) {
2036+ Register Reg = CS.getReg ();
2037+ if (!CS.isHandledByTarget ())
2038+ continue ;
2039+
2040+ // Build a scratch block store.
2041+ uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps (Reg);
2042+ int FrameIndex = CS.getFrameIdx ();
2043+ MachinePointerInfo PtrInfo =
2044+ MachinePointerInfo::getFixedStack (*MF, FrameIndex);
2045+ MachineMemOperand *MMO =
2046+ MF->getMachineMemOperand (PtrInfo, MachineMemOperand::MOStore,
2047+ FrameInfo.getObjectSize (FrameIndex),
2048+ FrameInfo.getObjectAlign (FrameIndex));
2049+
2050+ BuildMI (MBB, MI, MI->getDebugLoc (),
2051+ TII->get (AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
2052+ .addReg (Reg, getKillRegState (false ))
2053+ .addFrameIndex (FrameIndex)
2054+ .addReg (MFI->getStackPtrOffsetReg ())
2055+ .addImm (0 )
2056+ .addImm (Mask)
2057+ .addMemOperand (MMO);
2058+
2059+ FuncInfo->setHasSpilledVGPRs ();
2060+
2061+ // Add the register to the liveins. This is necessary because if any of the
2062+ // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2063+ // then the whole block will be marked as reserved and `updateLiveness` will
2064+ // skip it.
2065+ MBB.addLiveIn (Reg);
2066+ }
2067+
2068+ return false ;
2069+ }
2070+
2071+ bool SIFrameLowering::restoreCalleeSavedRegisters (
2072+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
2073+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
2074+ MachineFunction *MF = MBB.getParent ();
2075+ const GCNSubtarget &ST = MF->getSubtarget <GCNSubtarget>();
2076+ if (!ST.useVGPRBlockOpsForCSR ())
2077+ return false ;
2078+
2079+ SIMachineFunctionInfo *FuncInfo = MF->getInfo <SIMachineFunctionInfo>();
2080+ MachineFrameInfo &MFI = MF->getFrameInfo ();
2081+ const SIInstrInfo *TII = ST.getInstrInfo ();
2082+ const SIRegisterInfo *SITRI = static_cast <const SIRegisterInfo *>(TRI);
2083+ for (const CalleeSavedInfo &CS : reverse (CSI)) {
2084+ if (!CS.isHandledByTarget ())
2085+ continue ;
2086+
2087+ // Build a scratch block load.
2088+ Register Reg = CS.getReg ();
2089+ uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps (Reg);
2090+ int FrameIndex = CS.getFrameIdx ();
2091+ MachinePointerInfo PtrInfo =
2092+ MachinePointerInfo::getFixedStack (*MF, FrameIndex);
2093+ MachineMemOperand *MMO = MF->getMachineMemOperand (
2094+ PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize (FrameIndex),
2095+ MFI.getObjectAlign (FrameIndex));
2096+
2097+ auto MIB = BuildMI (MBB, MI, MI->getDebugLoc (),
2098+ TII->get (AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), Reg)
2099+ .addFrameIndex (FrameIndex)
2100+ .addReg (FuncInfo->getStackPtrOffsetReg ())
2101+ .addImm (0 )
2102+ .addImm (Mask)
2103+ .addMemOperand (MMO);
2104+ SITRI->addImplicitUsesForBlockCSRLoad (MIB, Reg);
2105+
2106+ // Add the register to the liveins. This is necessary because if any of the
2107+ // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2108+ // then the whole block will be marked as reserved and `updateLiveness` will
2109+ // skip it.
2110+ if (!MBB.isLiveIn (Reg))
2111+ MBB.addLiveIn (Reg);
2112+ }
2113+
2114+ return false ;
2115+ }
2116+
19182117MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr (
19192118 MachineFunction &MF,
19202119 MachineBasicBlock &MBB,
0 commit comments