@@ -1694,6 +1694,110 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
16941694 }
16951695}
16961696
1697+ static void assignSlotsUsingVGPRBlocks (MachineFunction &MF,
1698+ const GCNSubtarget &ST,
1699+ const TargetRegisterInfo *TRI,
1700+ std::vector<CalleeSavedInfo> &CSI,
1701+ unsigned &MinCSFrameIndex,
1702+ unsigned &MaxCSFrameIndex) {
1703+ SIMachineFunctionInfo *FuncInfo = MF.getInfo <SIMachineFunctionInfo>();
1704+ MachineFrameInfo &MFI = MF.getFrameInfo ();
1705+ const SIInstrInfo *TII = ST.getInstrInfo ();
1706+ const SIRegisterInfo *MRI = ST.getRegisterInfo ();
1707+
1708+ assert (std::is_sorted (CSI.begin (), CSI.end (),
1709+ [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
1710+ return A.getReg () < B.getReg ();
1711+ }) &&
1712+ " Callee saved registers not sorted" );
1713+
1714+ auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
1715+ return !CSI.isSpilledToReg () &&
1716+ MRI->isVGPR (MF.getRegInfo (), CSI.getReg ()) &&
1717+ !FuncInfo->isWWMReservedRegister (CSI.getReg ());
1718+ };
1719+
1720+ auto CSEnd = CSI.end ();
1721+ for (auto CSIt = CSI.begin (); CSIt != CSEnd; ++CSIt) {
1722+ Register Reg = CSIt->getReg ();
1723+ if (!CanUseBlockOps (*CSIt))
1724+ continue ;
1725+
1726+ // Find all the regs that will fit in a 32-bit block starting at the current
1727+ // reg and build the mask. It should have 1 for every register that's
1728+ // included, with the current register as the least significant bit.
1729+ uint32_t Mask = 1 ;
1730+ CSEnd = std::remove_if (
1731+ CSIt + 1 , CSEnd, [&](const CalleeSavedInfo &CSI) -> bool {
1732+ if (CanUseBlockOps (CSI) && CSI.getReg () < Reg + 32 ) {
1733+ Mask |= 1 << (CSI.getReg () - Reg);
1734+ return true ;
1735+ } else {
1736+ return false ;
1737+ }
1738+ });
1739+
1740+ const TargetRegisterClass *BlockRegClass =
1741+ TII->getRegClassForBlockOp (TRI, MF);
1742+ Register RegBlock =
1743+ MRI->getMatchingSuperReg (Reg, AMDGPU::sub0, BlockRegClass);
1744+ if (!RegBlock) {
1745+ // We couldn't find a super register for the block. This can happen if
1746+ // the register we started with is too high (e.g. v232 if the maximum is
1747+ // v255). We therefore try to get the last register block and figure out
1748+ // the mask from there.
1749+ Register LastBlockStart =
1750+ AMDGPU::VGPR0 + alignDown (Reg - AMDGPU::VGPR0, 32 );
1751+ RegBlock =
1752+ MRI->getMatchingSuperReg (LastBlockStart, AMDGPU::sub0, BlockRegClass);
1753+ assert (RegBlock && MRI->isSubRegister (RegBlock, Reg) &&
1754+ " Couldn't find super register" );
1755+ int RegDelta = Reg - LastBlockStart;
1756+ assert (RegDelta > 0 && llvm::countl_zero (Mask) >= RegDelta &&
1757+ " Bad shift amount" );
1758+ Mask <<= RegDelta;
1759+ }
1760+
1761+ FuncInfo->setMaskForVGPRBlockOps (RegBlock, Mask);
1762+
1763+ // The stack objects can be a bit smaller than the register block if we know
1764+ // some of the high bits of Mask are 0. This may happen often with calling
1765+ // conventions where the caller and callee-saved VGPRs are interleaved at
1766+ // a small boundary (e.g. 8 or 16).
1767+ int UnusedBits = llvm::countl_zero (Mask);
1768+ unsigned BlockSize = MRI->getSpillSize (*BlockRegClass) - UnusedBits * 4 ;
1769+ int FrameIdx =
1770+ MFI.CreateStackObject (BlockSize, MRI->getSpillAlign (*BlockRegClass),
1771+ /* isSpillSlot=*/ true );
1772+ if ((unsigned )FrameIdx < MinCSFrameIndex)
1773+ MinCSFrameIndex = FrameIdx;
1774+ if ((unsigned )FrameIdx > MaxCSFrameIndex)
1775+ MaxCSFrameIndex = FrameIdx;
1776+
1777+ CSIt->setFrameIdx (FrameIdx);
1778+ CSIt->setReg (RegBlock);
1779+ CSIt->setHandledByTarget ();
1780+ }
1781+ CSI.erase (CSEnd, CSI.end ());
1782+ }
1783+
1784+ bool SIFrameLowering::assignCalleeSavedSpillSlots (
1785+ MachineFunction &MF, const TargetRegisterInfo *TRI,
1786+ std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
1787+ unsigned &MaxCSFrameIndex) const {
1788+ if (CSI.empty ())
1789+ return true ; // Early exit if no callee saved registers are modified!
1790+
1791+ const GCNSubtarget &ST = MF.getSubtarget <GCNSubtarget>();
1792+ bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR ();
1793+
1794+ if (UseVGPRBlocks)
1795+ assignSlotsUsingVGPRBlocks (MF, ST, TRI, CSI, MinCSFrameIndex,
1796+ MaxCSFrameIndex);
1797+
1798+ return assignCalleeSavedSpillSlots (MF, TRI, CSI);
1799+ }
1800+
16971801bool SIFrameLowering::assignCalleeSavedSpillSlots (
16981802 MachineFunction &MF, const TargetRegisterInfo *TRI,
16991803 std::vector<CalleeSavedInfo> &CSI) const {
@@ -1763,6 +1867,101 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
17631867 return true ;
17641868}
17651869
1870+ bool SIFrameLowering::spillCalleeSavedRegisters (
1871+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
1872+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
1873+ MachineFunction *MF = MBB.getParent ();
1874+ const GCNSubtarget &ST = MF->getSubtarget <GCNSubtarget>();
1875+ if (!ST.useVGPRBlockOpsForCSR ())
1876+ return false ;
1877+
1878+ MachineFrameInfo &FrameInfo = MF->getFrameInfo ();
1879+ SIMachineFunctionInfo *MFI = MF->getInfo <SIMachineFunctionInfo>();
1880+ const SIInstrInfo *TII = ST.getInstrInfo ();
1881+ SIMachineFunctionInfo *FuncInfo = MF->getInfo <SIMachineFunctionInfo>();
1882+
1883+ for (const CalleeSavedInfo &CS : CSI) {
1884+ Register Reg = CS.getReg ();
1885+ if (!CS.isHandledByTarget ())
1886+ continue ;
1887+
1888+ // Build a scratch block store.
1889+ uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps (Reg);
1890+ int FrameIndex = CS.getFrameIdx ();
1891+ MachinePointerInfo PtrInfo =
1892+ MachinePointerInfo::getFixedStack (*MF, FrameIndex);
1893+ MachineMemOperand *MMO =
1894+ MF->getMachineMemOperand (PtrInfo, MachineMemOperand::MOStore,
1895+ FrameInfo.getObjectSize (FrameIndex),
1896+ FrameInfo.getObjectAlign (FrameIndex));
1897+
1898+ BuildMI (MBB, MI, MI->getDebugLoc (),
1899+ TII->get (AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
1900+ .addReg (Reg, getKillRegState (false ))
1901+ .addFrameIndex (FrameIndex)
1902+ .addReg (MFI->getStackPtrOffsetReg ())
1903+ .addImm (0 )
1904+ .addImm (Mask)
1905+ .addMemOperand (MMO);
1906+
1907+ FuncInfo->setHasSpilledVGPRs ();
1908+
1909+ // Add the register to the liveins. This is necessary because if any of the
1910+ // VGPRs in the register block is reserved (e.g. if it's a WWM register),
1911+ // then the whole block will be marked as reserved and `updateLiveness` will
1912+ // skip it.
1913+ MBB.addLiveIn (Reg);
1914+ }
1915+
1916+ return false ;
1917+ }
1918+
1919+ bool SIFrameLowering::restoreCalleeSavedRegisters (
1920+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
1921+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
1922+ MachineFunction *MF = MBB.getParent ();
1923+ const GCNSubtarget &ST = MF->getSubtarget <GCNSubtarget>();
1924+ if (!ST.useVGPRBlockOpsForCSR ())
1925+ return false ;
1926+
1927+ SIMachineFunctionInfo *FuncInfo = MF->getInfo <SIMachineFunctionInfo>();
1928+ MachineFrameInfo &MFI = MF->getFrameInfo ();
1929+ const SIInstrInfo *TII = ST.getInstrInfo ();
1930+ const SIRegisterInfo *SITRI = static_cast <const SIRegisterInfo *>(TRI);
1931+ for (const CalleeSavedInfo &CS : reverse (CSI)) {
1932+ if (!CS.isHandledByTarget ())
1933+ continue ;
1934+
1935+ // Build a scratch block load.
1936+ Register Reg = CS.getReg ();
1937+ uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps (Reg);
1938+ int FrameIndex = CS.getFrameIdx ();
1939+ MachinePointerInfo PtrInfo =
1940+ MachinePointerInfo::getFixedStack (*MF, FrameIndex);
1941+ MachineMemOperand *MMO = MF->getMachineMemOperand (
1942+ PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize (FrameIndex),
1943+ MFI.getObjectAlign (FrameIndex));
1944+
1945+ auto MIB = BuildMI (MBB, MI, MI->getDebugLoc (),
1946+ TII->get (AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), Reg)
1947+ .addFrameIndex (FrameIndex)
1948+ .addReg (FuncInfo->getStackPtrOffsetReg ())
1949+ .addImm (0 )
1950+ .addImm (Mask)
1951+ .addMemOperand (MMO);
1952+ SITRI->addImplicitUsesForBlockCSRLoad (MIB, Reg);
1953+
1954+ // Add the register to the liveins. This is necessary because if any of the
1955+ // VGPRs in the register block is reserved (e.g. if it's a WWM register),
1956+ // then the whole block will be marked as reserved and `updateLiveness` will
1957+ // skip it.
1958+ if (!MBB.isLiveIn (Reg))
1959+ MBB.addLiveIn (Reg);
1960+ }
1961+
1962+ return false ;
1963+ }
1964+
17661965MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr (
17671966 MachineFunction &MF,
17681967 MachineBasicBlock &MBB,
0 commit comments