Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a525bba
Add subtarget feature
rovka Jan 24, 2025
1ceab6a
[AMDGPU] ISel & PEI for whole wave functions
rovka Jan 27, 2025
399e08c
Use MF instead of MBB
rovka Mar 17, 2025
8f72b59
Revert "Add subtarget feature"
rovka Mar 11, 2025
accbe8e
Add new CC. Do nothing
rovka Mar 19, 2025
1a82d88
Replace SubtargetFeature with CallingConv
rovka Mar 11, 2025
ea3821b
Enable gisel in tests
rovka Mar 17, 2025
1b20edd
GISel support
rovka Mar 11, 2025
5e97750
Rename pseudo to match others
rovka Mar 19, 2025
be094ce
Rename CC
rovka Mar 25, 2025
b1a17c6
Fix formatting
rovka Mar 25, 2025
75017e9
Merge branch 'main' into whole-wave-funcs
rovka Apr 7, 2025
4c6beec
Merge remote-tracking branch 'remotes/origin/main' into whole-wave-funcs
rovka Apr 30, 2025
80e6433
Update tests after merge
rovka May 6, 2025
552e220
Fix bug in testcase
rovka May 6, 2025
7ed7e96
Test inreg args
rovka May 19, 2025
8325ef1
Merge remote-tracking branch 'remotes/origin/main' into whole-wave-funcs
rovka May 20, 2025
e1f133e
Add docs and fixme
rovka May 20, 2025
ac70a87
Remove kill flags on orig exec mask
rovka Jun 17, 2025
08102a3
Add helper to add orig exec to return
rovka Jun 23, 2025
1cd402f
Test with single use of orig exec
rovka Jun 23, 2025
e8fc4bd
Test calling gfx func from wwf
rovka Jun 23, 2025
8feed10
Test wave64
rovka Jun 24, 2025
bc7b9ef
Merge remote-tracking branch 'remotes/origin/main' into whole-wave-funcs
rovka Jun 24, 2025
ba08290
Merge remote-tracking branch 'remotes/origin/main' into whole-wave-funcs
rovka Jun 24, 2025
bc8d8ce
Fix a few missed spots
rovka Jun 24, 2025
0eb6c66
clang-format
rovka Jun 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1843,6 +1843,20 @@ The AMDGPU backend supports the following calling conventions:
..TODO::
Describe.

``amdgpu_gfx_whole_wave`` Used for AMD graphics targets. Functions with this calling convention
cannot be used as entry points. They must have an i1 as the first argument,
which will be mapped to the value of EXEC on entry into the function. Other
arguments will contain poison in their inactive lanes. Similarly, the return
value for the inactive lanes is poison.

The function will run with all lanes enabled, i.e. EXEC will be set to -1 in the
prologue and restored to its original value in the epilogue. The inactive lanes
will be preserved for all the registers used by the function. Active lanes
will only be preserved for the callee saved registers.

In all other respects, functions with this calling convention behave like
``amdgpu_gfx`` functions.

``amdgpu_gs`` Used for Mesa/AMDPAL geometry shaders.
..TODO::
Describe.
Expand Down
1 change: 1 addition & 0 deletions llvm/include/llvm/AsmParser/LLToken.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ enum Kind {
kw_amdgpu_cs_chain_preserve,
kw_amdgpu_kernel,
kw_amdgpu_gfx,
kw_amdgpu_gfx_whole_wave,
kw_tailcc,
kw_m68k_rtdcc,
kw_graalcc,
Expand Down
3 changes: 3 additions & 0 deletions llvm/include/llvm/IR/CallingConv.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,9 @@ namespace CallingConv {
RISCV_VLSCall_32768 = 122,
RISCV_VLSCall_65536 = 123,

// Calling convention for AMDGPU whole wave functions.
AMDGPU_Gfx_WholeWave = 124,

/// The highest possible ID. Must be some 2^k - 1.
MaxID = 1023
};
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/AsmParser/LLLexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(amdgpu_cs_chain_preserve);
KEYWORD(amdgpu_kernel);
KEYWORD(amdgpu_gfx);
KEYWORD(amdgpu_gfx_whole_wave);
KEYWORD(tailcc);
KEYWORD(m68k_rtdcc);
KEYWORD(graalcc);
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/AsmParser/LLParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2274,6 +2274,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
CC = CallingConv::AMDGPU_CS_ChainPreserve;
break;
case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break;
case lltok::kw_amdgpu_gfx_whole_wave:
CC = CallingConv::AMDGPU_Gfx_WholeWave;
break;
case lltok::kw_tailcc: CC = CallingConv::Tail; break;
case lltok::kw_m68k_rtdcc: CC = CallingConv::M68k_RTD; break;
case lltok::kw_graalcc: CC = CallingConv::GRAAL; break;
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/IR/AsmWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
break;
case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break;
case CallingConv::AMDGPU_Gfx: Out << "amdgpu_gfx"; break;
case CallingConv::AMDGPU_Gfx_WholeWave:
Out << "amdgpu_gfx_whole_wave";
break;
case CallingConv::M68k_RTD: Out << "m68k_rtdcc"; break;
case CallingConv::RISCV_VectorCall:
Out << "riscv_vector_cc";
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/IR/Function.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1224,6 +1224,7 @@ bool llvm::CallingConv::supportsNonVoidReturnType(CallingConv::ID CC) {
case CallingConv::AArch64_SVE_VectorCall:
case CallingConv::WASM_EmscriptenInvoke:
case CallingConv::AMDGPU_Gfx:
case CallingConv::AMDGPU_Gfx_WholeWave:
case CallingConv::M68k_INTR:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
Expand Down
33 changes: 30 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -374,15 +374,21 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
return true;
}

unsigned ReturnOpc =
IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
const bool IsWholeWave = MFI->isWholeWaveFunction();
unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN
: IsShader ? AMDGPU::SI_RETURN_TO_EPILOG
: AMDGPU::SI_RETURN;
auto Ret = B.buildInstrNoInsert(ReturnOpc);

if (!FLI.CanLowerReturn)
insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
else if (!lowerReturnVal(B, Val, VRegs, Ret))
return false;

if (IsWholeWave) {
addOriginalExecToReturn(B.getMF(), Ret);
}

// TODO: Handle CalleeSavedRegsViaCopy.

B.insertInstr(Ret);
Expand Down Expand Up @@ -632,6 +638,17 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (DL.getTypeStoreSize(Arg.getType()) == 0)
continue;

if (Info->isWholeWaveFunction() && Idx == 0) {
assert(VRegs[Idx].size() == 1 && "Expected only one register");

// The first argument for whole wave functions is the original EXEC value.
B.buildInstr(AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
.addDef(VRegs[Idx][0]);

++Idx;
continue;
}

const bool InReg = Arg.hasAttribute(Attribute::InReg);

if (Arg.hasAttribute(Attribute::SwiftSelf) ||
Expand Down Expand Up @@ -1347,6 +1364,7 @@ bool AMDGPUCallLowering::lowerTailCall(
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave &&
!AMDGPU::isChainCC(Info.CallConv)) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
Expand Down Expand Up @@ -1524,7 +1542,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
Expand Down Expand Up @@ -1592,3 +1611,11 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,

return true;
}

// Append the original EXEC mask as an extra register operand on the whole-wave
// function return. The value is taken from the result of the
// SI_WHOLE_WAVE_FUNC_SETUP instruction that was emitted on function entry
// (located via TII->getWholeWaveFunctionSetup), so the epilogue can restore
// EXEC to its value at the call site.
void AMDGPUCallLowering::addOriginalExecToReturn(
    MachineFunction &MF, MachineInstrBuilder &Ret) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  // Operand 0 of the setup instruction defines the register holding the
  // original EXEC mask; forward that register to the return instruction.
  const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF);
  Ret.addReg(Setup->getOperand(0).getReg());
}
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ class AMDGPUCallLowering final : public CallLowering {
bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;

void addOriginalExecToReturn(MachineFunction &MF,
MachineInstrBuilder &Ret) const;

public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,10 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;

def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
// so we don't mark it as equivalent.

class GISelSop2Pat <
SDPatternOperator node,
Instruction inst,
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1138,6 +1138,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::Cold:
return CC_AMDGPU_Func;
case CallingConv::AMDGPU_Gfx:
case CallingConv::AMDGPU_Gfx_WholeWave:
return CC_SI_Gfx;
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
Expand All @@ -1163,6 +1164,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
case CallingConv::AMDGPU_LS:
return RetCC_SI_Shader;
case CallingConv::AMDGPU_Gfx:
case CallingConv::AMDGPU_Gfx_WholeWave:
return RetCC_SI_Gfx;
case CallingConv::C:
case CallingConv::Fast:
Expand Down Expand Up @@ -5716,6 +5718,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
NODE_NAME_CASE(WHOLE_WAVE_SETUP)
NODE_NAME_CASE(WHOLE_WAVE_RETURN)
}
return nullptr;
}
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,12 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_FMAX,
BUFFER_ATOMIC_COND_SUB_U32,
LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32,

// Set up a whole wave function.
WHOLE_WAVE_SETUP,

// Return from a whole wave function.
WHOLE_WAVE_RETURN,
};

} // End namespace AMDGPUISD
Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,17 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",

def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;

// Marks the entry into a whole wave function. Produces a single integer
// result (the register that receives the original EXEC mask — see the
// amdgpu_gfx_whole_wave calling convention). SDNPSideEffect keeps it from
// being dropped or moved, since prologue emission relies on finding it.
def AMDGPUwhole_wave_setup : SDNode<
  "AMDGPUISD::WHOLE_WAVE_SETUP", SDTypeProfile<1, 0, [SDTCisInt<0>]>,
  [SDNPHasChain, SDNPSideEffect]>;

// Marks the return from a whole wave function. Variadic so the return-value
// registers (and the original EXEC operand) can be attached as extra
// operands; optional glue links it to the preceding copies.
def AMDGPUwhole_wave_return : SDNode<
  "AMDGPUISD::WHOLE_WAVE_RETURN", SDTNone,
  [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;

// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4141,6 +4141,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return true;
case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
return selectWaveAddress(I);
case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
return true;
}
case AMDGPU::G_STACKRESTORE:
return selectStackRestore(I);
case AMDGPU::G_PHI:
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5456,6 +5456,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_PREFETCH:
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
break;
case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP:
case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
}

return getInstructionMapping(/*ID*/1, /*Cost*/1,
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3155,7 +3155,7 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
// Check entry priority at each export (as there will only be a few).
// Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
bool Changed = false;
if (CC != CallingConv::AMDGPU_Gfx)
if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

auto NextMI = std::next(It);
Expand Down
87 changes: 77 additions & 10 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -946,8 +946,18 @@ static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,

initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

ScratchExecCopy = findScratchNonCalleeSaveRegister(
MRI, LiveUnits, *TRI.getWaveMaskRegClass());
if (FuncInfo->isWholeWaveFunction()) {
// Whole wave functions already have a copy of the original EXEC mask that
// we can use.
assert(IsProlog && "Epilog should look at return, not setup");
ScratchExecCopy =
TII->getWholeWaveFunctionSetup(MF)->getOperand(0).getReg();
assert(ScratchExecCopy && "Couldn't find copy of EXEC");
} else {
ScratchExecCopy = findScratchNonCalleeSaveRegister(
MRI, LiveUnits, *TRI.getWaveMaskRegClass());
}

if (!ScratchExecCopy)
report_fatal_error("failed to find free scratch register");

Expand Down Expand Up @@ -996,10 +1006,15 @@ void SIFrameLowering::emitCSRSpillStores(
};

StoreWWMRegisters(WWMScratchRegs);

auto EnableAllLanes = [&]() {
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
};

if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
EnableAllLanes();
} else {
ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ true,
Expand All @@ -1008,7 +1023,18 @@ void SIFrameLowering::emitCSRSpillStores(
}

StoreWWMRegisters(WWMCalleeSavedRegs);
if (ScratchExecCopy) {
if (FuncInfo->isWholeWaveFunction()) {
// SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove
// it now. If we have already saved some WWM CSR registers, then the EXEC is
// already -1 and we don't need to do anything else. Otherwise, set EXEC to
// -1 here.
if (!ScratchExecCopy)
buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
/*EnableInactiveLanes*/ true);
else if (WWMCalleeSavedRegs.empty())
EnableAllLanes();
TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
} else if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
Expand Down Expand Up @@ -1083,11 +1109,6 @@ void SIFrameLowering::emitCSRSpillRestores(
Register ScratchExecCopy;
SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
if (!WWMScratchRegs.empty())
ScratchExecCopy =
buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ false, /*EnableInactiveLanes*/ true);

auto RestoreWWMRegisters =
[&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
for (const auto &Reg : WWMRegs) {
Expand All @@ -1098,6 +1119,36 @@ void SIFrameLowering::emitCSRSpillRestores(
}
};

if (FuncInfo->isWholeWaveFunction()) {
// For whole wave functions, the EXEC is already -1 at this point.
// Therefore, we can restore the CSR WWM registers right away.
RestoreWWMRegisters(WWMCalleeSavedRegs);

// The original EXEC is the first operand of the return instruction.
const MachineInstr &Return = MBB.instr_back();
assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN &&
"Unexpected return inst");
Register OrigExec = Return.getOperand(0).getReg();

if (!WWMScratchRegs.empty()) {
unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec())
.addReg(OrigExec)
.addImm(-1);
RestoreWWMRegisters(WWMScratchRegs);
}

// Restore original EXEC.
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
return;
}

if (!WWMScratchRegs.empty())
ScratchExecCopy =
buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ false, /*EnableInactiveLanes*/ true);

RestoreWWMRegisters(WWMScratchRegs);
if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
Expand Down Expand Up @@ -1634,6 +1685,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
NeedExecCopyReservedReg = true;
else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
(MFI->isChainFunction() &&
TII->isChainCallOpcode(MI.getOpcode()))) {
// We expect all return to be the same size.
Expand Down Expand Up @@ -1662,6 +1714,21 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MFI->isEntryFunction())
return;

if (MFI->isWholeWaveFunction()) {
// In practice, all the VGPRs are WWM registers, and we will need to save at
// least their inactive lanes. Add them to WWMReservedRegs.
assert(!NeedExecCopyReservedReg &&
"Whole wave functions can use the reg mapped for their i1 argument");

// FIXME: Be more efficient!
for (MCRegister Reg : AMDGPU::VGPR_32RegClass)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is going to be expensive.
It's probably fine for making this work, but long term I think we'd need to do this differently.

if (MF.getRegInfo().isPhysRegModified(Reg)) {
MFI->reserveWWMRegister(Reg);
MF.begin()->addLiveIn(Reg);
}
MF.begin()->sortUniqueLiveIns();
}

// Remove any VGPRs used in the return value because these do not need to be saved.
// This prevents CSR restore from clobbering return VGPRs.
if (ReturnMI) {
Expand Down
Loading
Loading