Skip to content

Commit 4636848

Browse files
rovkakcloudy0717
authored and committed
[AMDGPU] Allow any SGPRs for chain callees (llvm#168345)
Chain calls never return and don't need to preserve any SGPRs. Therefore, we don't need to limit the registers used for callees to the CCR_SGPR_64 register class - it's fine to use any available SGPRs. Also introduce a new pseudo, SI_TCRETURN_CHAIN, whose callee operand is likewise a plain SGPR_64. This is necessary because we can no longer lower SI_CS_CHAIN_TC to SI_TCRETURN: the SI_CS_CHAIN_TC callee operand (SGPR_64) now accepts a wider range of registers than the SI_TCRETURN operand (CCR_SGPR_64).
1 parent eff9742 commit 4636848

9 files changed

+146
-99
lines changed

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -229,7 +229,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
229229
OutMI.addOperand(Src);
230230
return;
231231
} else if (Opcode == AMDGPU::SI_TCRETURN ||
232-
Opcode == AMDGPU::SI_TCRETURN_GFX) {
232+
Opcode == AMDGPU::SI_TCRETURN_GFX ||
233+
Opcode == AMDGPU::SI_TCRETURN_CHAIN) {
233234
// TODO: How to use branch immediate and avoid register+add?
234235
Opcode = AMDGPU::S_SETPC_B64;
235236
} else if (AMDGPU::getT16D16Helper(Opcode)) {

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 17 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -820,9 +820,8 @@ def SI_CALL : SPseudoInstSI <
820820
let isConvergent = 1;
821821
}
822822

823-
class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
824-
(ins rc:$src0, unknown:$callee, i32imm:$fpdiff),
825-
[(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
823+
class SI_TCRETURN_Pseudo<RegisterClass rc, list<dag> pattern = []>
824+
: SPseudoInstSI <(outs), (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), pattern> {
826825
let Size = 4;
827826
let FixedSize = 1;
828827
let isCall = 1;
@@ -836,8 +835,15 @@ class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
836835
}
837836

838837
// Tail call handling pseudo
839-
def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>;
840-
def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>;
838+
def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64,
839+
[(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>;
840+
def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64,
841+
[(AMDGPUtc_return_gfx i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>;
842+
843+
// Tail call for chain calling conventions.
844+
// Uses unrestricted SGPR_64 instead of CCR_SGPR_64 because chain calls
845+
// never return and don't need to preserve any SGPRs.
846+
def SI_TCRETURN_CHAIN : SI_TCRETURN_Pseudo<SGPR_64>;
841847

842848
// Handle selecting indirect tail calls
843849
def : GCNPat<
@@ -867,13 +873,13 @@ multiclass SI_CS_CHAIN_TC<
867873
// This is essentially a tail call, but it also takes a mask to put in EXEC
868874
// right before jumping to the callee.
869875
def NAME: SPseudoInstSI <(outs),
870-
(ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;
876+
(ins SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;
871877

872878
// Same as above, but it will first try to reallocate the VGPRs, and choose an
873879
// EXEC mask and a callee depending on the success of the reallocation attempt.
874880
def _DVGPR : SPseudoInstSI <(outs),
875-
(ins CCR_SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
876-
SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)>;
881+
(ins SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
882+
SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)>;
877883
} // End FixedSize = 0 etc
878884
}
879885

@@ -885,7 +891,7 @@ multiclass si_cs_chain_tc_pattern<
885891
dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> {
886892
def : GCNPat<
887893
(AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec),
888-
(tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
894+
(tc SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
889895
>;
890896
}
891897

@@ -912,8 +918,8 @@ multiclass si_cs_chain_tc_dvgpr_patterns<
912918
(AMDGPUtc_return_chain_dvgpr i64:$src0, callee, (i32 timm:$fpdiff),
913919
execvt:$exec, i32:$numvgprs,
914920
execvt:$fbexec, i64:$fbcallee),
915-
(tc CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec,
916-
SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)
921+
(tc SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec,
922+
SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)
917923
>;
918924
}
919925
}

llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -186,7 +186,7 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI,
186186
for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx)
187187
MI.removeOperand(OpIdx);
188188

189-
MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
189+
MI.setDesc(TII->get(AMDGPU::SI_TCRETURN_CHAIN));
190190
}
191191

192192
void SILateBranchLowering::earlyTerm(MachineInstr &MI,

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
2222
; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
2323
; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
2424
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
25-
; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
25+
; GFX11-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee
2626
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
2727
; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
2828
; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -51,7 +51,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
5151
; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
5252
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
5353
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
54-
; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
54+
; GFX10-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee
5555
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
5656
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
5757
; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -86,7 +86,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
8686
; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
8787
; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
8888
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
89-
; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
89+
; GFX11-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
9090
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
9191
; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
9292
; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -115,7 +115,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
115115
; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
116116
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
117117
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
118-
; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
118+
; GFX10-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
119119
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
120120
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
121121
; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)

llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll

Lines changed: 41 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44

55
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
66
declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
7-
declare void @llvm.amdgcn.cs.chain(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) noreturn
87

98
define amdgpu_cs_chain void @dynamic_vgprs(i32 inreg %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 inreg %num_vgpr) {
109
; GISEL-GFX12-LABEL: dynamic_vgprs:
@@ -94,4 +93,45 @@ define amdgpu_cs_chain void @constants(<3 x i32> inreg %sgpr, { i32, ptr addrspa
9493
unreachable
9594
}
9695

96+
define amdgpu_cs_chain void @high_sgpr_pressure(<30 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
97+
; GISEL-GFX12-LABEL: high_sgpr_pressure:
98+
; GISEL-GFX12: ; %bb.0:
99+
; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
100+
; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
101+
; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
102+
; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
103+
; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
104+
; GISEL-GFX12-NEXT: s_mov_b32 s30, callee_high_sgpr@abs32@lo
105+
; GISEL-GFX12-NEXT: s_mov_b32 s31, callee_high_sgpr@abs32@hi
106+
; GISEL-GFX12-NEXT: s_mov_b32 s34, retry_vgpr_alloc@abs32@lo
107+
; GISEL-GFX12-NEXT: s_mov_b32 s35, retry_vgpr_alloc@abs32@hi
108+
; GISEL-GFX12-NEXT: s_alloc_vgpr 64
109+
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
110+
; GISEL-GFX12-NEXT: s_cselect_b64 s[30:31], s[30:31], s[34:35]
111+
; GISEL-GFX12-NEXT: s_cselect_b32 exec_lo, 7, -1
112+
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
113+
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
114+
;
115+
; DAGISEL-GFX12-LABEL: high_sgpr_pressure:
116+
; DAGISEL-GFX12: ; %bb.0:
117+
; DAGISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
118+
; DAGISEL-GFX12-NEXT: s_wait_expcnt 0x0
119+
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
120+
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
121+
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
122+
; DAGISEL-GFX12-NEXT: s_mov_b32 s31, retry_vgpr_alloc@abs32@hi
123+
; DAGISEL-GFX12-NEXT: s_mov_b32 s30, retry_vgpr_alloc@abs32@lo
124+
; DAGISEL-GFX12-NEXT: s_mov_b32 s35, callee_high_sgpr@abs32@hi
125+
; DAGISEL-GFX12-NEXT: s_mov_b32 s34, callee_high_sgpr@abs32@lo
126+
; DAGISEL-GFX12-NEXT: s_alloc_vgpr 64
127+
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
128+
; DAGISEL-GFX12-NEXT: s_cselect_b64 s[34:35], s[34:35], s[30:31]
129+
; DAGISEL-GFX12-NEXT: s_cselect_b32 exec_lo, 7, -1
130+
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
131+
; DAGISEL-GFX12-NEXT: s_setpc_b64 s[34:35]
132+
call void(ptr, i32, <30 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_high_sgpr, i32 7, <30 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 1, i32 inreg 64, i32 inreg -1, ptr @retry_vgpr_alloc)
133+
unreachable
134+
}
135+
136+
declare amdgpu_cs_chain void @callee_high_sgpr(<30 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
97137
declare amdgpu_cs_chain_preserve void @retry_vgpr_alloc(<3 x i32> inreg %sgpr)

0 commit comments

Comments
 (0)