Skip to content
This repository was archived by the owner on Sep 15, 2025. It is now read-only.

Commit 833fdac

Browse files
committed
Update [AMDGPU] Dynamic VGPR support for llvm.amdgcn.cs.chain llvm#130094
1 parent cbadf87 commit 833fdac

File tree

7 files changed

+47
-31
lines changed

7 files changed

+47
-31
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1246,15 +1246,15 @@ bool AMDGPUCallLowering::lowerTailCall(
12461246
// On GFX12, we can only change the VGPR allocation for wave32.
12471247
if (!ST.isWave32()) {
12481248
F.getContext().diagnose(DiagnosticInfoUnsupported(
1249-
F, "Dynamic VGPR mode is only supported for wave32\n"));
1249+
F, "dynamic VGPR mode is only supported for wave32"));
12501250
return false;
12511251
}
12521252

12531253
ArgInfo FallbackExecArg = Info.OrigArgs[ChainCallArgIdx::FallbackExec];
12541254
assert(FallbackExecArg.Regs.size() == 1 &&
12551255
"Expected single register for fallback EXEC");
12561256
if (!FallbackExecArg.Ty->isIntegerTy(ST.getWavefrontSize())) {
1257-
LLVM_DEBUG(dbgs() << "Bad type for fallback EXEC");
1257+
LLVM_DEBUG(dbgs() << "Bad type for fallback EXEC\n");
12581258
return false;
12591259
}
12601260
}

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5475,6 +5475,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
54755475
NODE_NAME_CASE(TC_RETURN)
54765476
NODE_NAME_CASE(TC_RETURN_GFX)
54775477
NODE_NAME_CASE(TC_RETURN_CHAIN)
5478+
NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
54785479
NODE_NAME_CASE(TRAP)
54795480
NODE_NAME_CASE(RET_GLUE)
54805481
NODE_NAME_CASE(WAVE_ADDRESS)

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,7 @@ enum NodeType : unsigned {
402402
TC_RETURN,
403403
TC_RETURN_GFX,
404404
TC_RETURN_CHAIN,
405+
TC_RETURN_CHAIN_DVGPR,
405406
TRAP,
406407

407408
// Masked control flow nodes.

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,12 @@ def AMDGPUtc_return_chain: SDNode<"AMDGPUISD::TC_RETURN_CHAIN",
9999
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
100100
>;
101101

102+
// With dynamic VGPRs.
103+
def AMDGPUtc_return_chain_dvgpr: SDNode<"AMDGPUISD::TC_RETURN_CHAIN_DVGPR",
104+
SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
105+
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
106+
>;
107+
102108
def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
103109
SDTypeProfile<0, 1, [SDTCisVT<0, i16>]>,
104110
[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPOptInGlue]

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3710,6 +3710,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
37103710
SDValue Callee = CLI.Callee;
37113711

37123712
llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
3713+
bool UsesDynamicVGPRs = false;
37133714
if (IsChainCallConv) {
37143715
// The last arguments should be the value that we need to put in EXEC,
37153716
// followed by the flags and any other arguments with special meanings.
@@ -3758,6 +3759,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
37583759
return lowerUnhandledCall(CLI, InVals, "Expected 3 additional args");
37593760
}
37603761

3762+
UsesDynamicVGPRs = true;
37613763
std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
37623764
CLI.Args.end(), PushNodeOrTargetConstant);
37633765
}
@@ -4091,7 +4093,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
40914093
break;
40924094
case CallingConv::AMDGPU_CS_Chain:
40934095
case CallingConv::AMDGPU_CS_ChainPreserve:
4094-
OPC = AMDGPUISD::TC_RETURN_CHAIN;
4096+
OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4097+
: AMDGPUISD::TC_RETURN_CHAIN;
40954098
break;
40964099
}
40974100

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -704,16 +704,16 @@ multiclass SI_CS_CHAIN_TC<
704704
SchedRW = [WriteBranch],
705705
isConvergent = 1,
706706
WaveSizePredicate = wavesizepred in {
707-
// This is essentially a tail call, but it also takes a mask to put in EXEC
708-
// right before jumping to the callee.
709-
def NAME: SPseudoInstSI <(outs),
710-
(ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;
711-
712-
// Same as above, but it will first try to reallocate the VGPRs, and choose an
713-
// EXEC mask and a callee depending on the success of the reallocation attempt.
714-
def _DVGPR : SPseudoInstSI <(outs),
715-
(ins CCR_SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
716-
SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)>;
707+
// This is essentially a tail call, but it also takes a mask to put in EXEC
708+
// right before jumping to the callee.
709+
def NAME: SPseudoInstSI <(outs),
710+
(ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;
711+
712+
// Same as above, but it will first try to reallocate the VGPRs, and choose an
713+
// EXEC mask and a callee depending on the success of the reallocation attempt.
714+
def _DVGPR : SPseudoInstSI <(outs),
715+
(ins CCR_SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
716+
SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)>;
717717
} // End FixedSize = 0 etc
718718
}
719719

@@ -747,16 +747,15 @@ defm : si_cs_chain_tc_patterns<i64>;
747747
multiclass si_cs_chain_tc_dvgpr_patterns<
748748
ValueType execvt, RegisterOperand execrc = getSOPSrcForVT<execvt>.ret,
749749
Instruction tc = SI_CS_CHAIN_TC_W32_DVGPR> {
750-
let AddedComplexity = 90 in {
751750
foreach callee = [ (i64 0), (tglobaladdr) ] in {
752751
def : GCNPat<
753-
(AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec,
754-
i32:$numvgprs, execvt:$fbexec, i64:$fbcallee),
752+
(AMDGPUtc_return_chain_dvgpr i64:$src0, callee, (i32 timm:$fpdiff),
753+
execvt:$exec, i32:$numvgprs,
754+
execvt:$fbexec, i64:$fbcallee),
755755
(tc CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec,
756756
SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)
757757
>;
758758
}
759-
} // AddedComplexity
760759
}
761760

762761
defm : si_cs_chain_tc_dvgpr_patterns<i32>; // On GFX12, dVGPR mode is wave32-only.

llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -117,11 +117,12 @@ static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
117117
MDT->applyUpdates(DTUpdates);
118118
}
119119

120-
static void addRegOrCopyOp(MachineInstrBuilder &MIB, MachineOperand &Op) {
120+
static void copyOpWithoutRegFlags(MachineInstrBuilder &MIB,
121+
MachineOperand &Op) {
121122
if (Op.isReg())
122123
MIB.addReg(Op.getReg());
123124
else
124-
MIB->addOperand(Op);
125+
MIB.add(Op);
125126
}
126127

127128
void SILateBranchLowering::expandChainCall(MachineInstr &MI,
@@ -138,30 +139,35 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI,
138139
// * Try to change the VGPR allocation
139140
// * Select the callee based on the result of the reallocation attempt
140141
// * Select the EXEC mask based on the result of the reallocation attempt
142+
// If any of the register operands of the chain pseudo is used in more than
143+
// one of these instructions, we need to make sure that the kill flags
144+
// aren't copied along.
141145
auto AllocMI =
142146
BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_ALLOC_VGPR));
143-
addRegOrCopyOp(AllocMI,
144-
*TII->getNamedOperand(MI, AMDGPU::OpName::numvgprs));
147+
copyOpWithoutRegFlags(AllocMI,
148+
*TII->getNamedOperand(MI, AMDGPU::OpName::numvgprs));
145149

146150
auto SelectCallee =
147151
BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_CSELECT_B64))
148152
.addDef(TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg());
149-
addRegOrCopyOp(SelectCallee,
150-
*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
151-
addRegOrCopyOp(SelectCallee,
152-
*TII->getNamedOperand(MI, AMDGPU::OpName::fbcallee));
153+
copyOpWithoutRegFlags(SelectCallee,
154+
*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
155+
copyOpWithoutRegFlags(SelectCallee,
156+
*TII->getNamedOperand(MI, AMDGPU::OpName::fbcallee));
153157

154158
auto SelectExec = BuildMI(*MI.getParent(), MI, DL,
155159
TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
156160
: AMDGPU::S_CSELECT_B64))
157161
.addDef(ExecReg);
158162

159-
addRegOrCopyOp(SelectExec, *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
160-
addRegOrCopyOp(SelectExec,
161-
*TII->getNamedOperand(MI, AMDGPU::OpName::fbexec));
163+
copyOpWithoutRegFlags(SelectExec,
164+
*TII->getNamedOperand(MI, AMDGPU::OpName::exec));
165+
copyOpWithoutRegFlags(SelectExec,
166+
*TII->getNamedOperand(MI, AMDGPU::OpName::fbexec));
162167
} else {
163168
auto SetExec = BuildMI(*MI.getParent(), MI, DL, TII->get(MovOpc), ExecReg);
164-
addRegOrCopyOp(SetExec, *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
169+
copyOpWithoutRegFlags(SetExec,
170+
*TII->getNamedOperand(MI, AMDGPU::OpName::exec));
165171
}
166172

167173
for (unsigned OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx;
@@ -215,12 +221,12 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
215221

216222
case AMDGPU::SI_CS_CHAIN_TC_W32:
217223
case AMDGPU::SI_CS_CHAIN_TC_W64:
218-
expandChainCall(MI, ST, /*DynamicVGPR=*/ false);
224+
expandChainCall(MI, ST, /*DynamicVGPR=*/false);
219225
MadeChange = true;
220226
break;
221227
case AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR:
222228
case AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR:
223-
expandChainCall(MI, ST, /*DynamicVGPR=*/ true);
229+
expandChainCall(MI, ST, /*DynamicVGPR=*/true);
224230
MadeChange = true;
225231
break;
226232

0 commit comments

Comments
 (0)