Skip to content

Commit a47682d

Browse files
authored
[AMDGPU][gfx13] use s_add_pc_i64 for rank-call (#3161)
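Make the rank-call more efficient: branch to the rank callee with a single s_add_pc_i64 on the callee's rel64 symbol, instead of computing its address with s_get_pc_i64 + s_add_nc_u64 and then jumping with s_set_pc_i64.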
1 parent 66b98bd commit a47682d

File tree

3 files changed

+19
-29
lines changed

3 files changed

+19
-29
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6836,15 +6836,19 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
68366836
.addReg(WaveIDInWaveGroup)
68376837
.addImm(Rank);
68386838
BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0)).addMBB(SplitBB);
6839+
68396840
// Call inside the conditional branch.
68406841
Register CalleeAddrReg = MI.getOperand(1).getReg();
6841-
BuildMI(*RankCallBB, RankCallBB->end(), DL, TII->get(AMDGPU::S_SETPC_B64))
6842-
.addReg(CalleeAddrReg);
6842+
auto CalleeAddrDef = MRI.getVRegDef(CalleeAddrReg);
6843+
assert(CalleeAddrDef->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET64);
6844+
// Use s_add_pc_i64, bypass the address computation.
6845+
BuildMI(*RankCallBB, RankCallBB->end(), DL, TII->get(AMDGPU::S_ADD_PC_I64))
6846+
.addGlobalAddress(CalleeAddrDef->getOperand(1).getGlobal(), 0,
6847+
SIInstrInfo::MO_REL64);
6848+
68436849
// Update IDX0 for the next rank-call. Use the global address of the rank
68446850
// callee as the source. In AsmPrinter, it will be replaced with the
68456851
// MCSymbol representing the number of VGPRs of that callee.
6846-
auto CalleeAddrDef = MRI.getVRegDef(CalleeAddrReg);
6847-
assert(CalleeAddrDef->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET64);
68486852
BuildMI(*SplitBB, SplitBB->begin(), DL, TII->get(AMDGPU::S_ADD_GPR_IDX_U32),
68496853
AMDGPU::IDX0)
68506854
.addGlobalAddress(CalleeAddrDef->getOperand(1).getGlobal(), 0,

llvm/test/CodeGen/AMDGPU/rank-specialization-lowered.ll

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -219,47 +219,45 @@ define dso_local amdgpu_kernel void @test_kernel_1() local_unnamed_addr #1 !reqd
219219
; CHECK-NEXT: .LBB5_3: ; %bb.rank_0_2_3_4_5_6_7
220220
; CHECK-NEXT: s_set_gpr_idx_u32 idx0, 0
221221
; CHECK-NEXT: s_cmp_eq_u32 s9, 0
222-
; CHECK-NEXT: s_get_pc_i64 s[10:11]
223-
; CHECK-NEXT: s_add_nc_u64 s[10:11], s[10:11], test_kernel_1.rank_0_2_3_4_5_6_7@rel64+4
224222
; CHECK-NEXT: s_cbranch_scc0 .LBB5_5
225223
; CHECK-NEXT: ; %bb.4:
226-
; CHECK-NEXT: s_set_pc_i64 s[10:11]
224+
; CHECK-NEXT: s_add_pc_i64 test_kernel_1.rank_0_2_3_4_5_6_7@rel64
227225
; CHECK-NEXT: .LBB5_5: ; %bb.rank_0_2_3_4_5_6_7
228226
; CHECK-NEXT: s_add_gpr_idx_u32 idx0, max(128, dummy_store.num_vgpr)
229227
; CHECK-NEXT: s_cmp_eq_u32 s9, 2
230228
; CHECK-NEXT: s_cbranch_scc0 .LBB5_7
231229
; CHECK-NEXT: ; %bb.6:
232-
; CHECK-NEXT: s_set_pc_i64 s[10:11]
230+
; CHECK-NEXT: s_add_pc_i64 test_kernel_1.rank_0_2_3_4_5_6_7@rel64
233231
; CHECK-NEXT: .LBB5_7: ; %bb.rank_0_2_3_4_5_6_7
234232
; CHECK-NEXT: s_add_gpr_idx_u32 idx0, max(128, dummy_store.num_vgpr)
235233
; CHECK-NEXT: s_cmp_eq_u32 s9, 3
236234
; CHECK-NEXT: s_cbranch_scc0 .LBB5_9
237235
; CHECK-NEXT: ; %bb.8:
238-
; CHECK-NEXT: s_set_pc_i64 s[10:11]
236+
; CHECK-NEXT: s_add_pc_i64 test_kernel_1.rank_0_2_3_4_5_6_7@rel64
239237
; CHECK-NEXT: .LBB5_9: ; %bb.rank_0_2_3_4_5_6_7
240238
; CHECK-NEXT: s_add_gpr_idx_u32 idx0, max(128, dummy_store.num_vgpr)
241239
; CHECK-NEXT: s_cmp_eq_u32 s9, 4
242240
; CHECK-NEXT: s_cbranch_scc0 .LBB5_11
243241
; CHECK-NEXT: ; %bb.10:
244-
; CHECK-NEXT: s_set_pc_i64 s[10:11]
242+
; CHECK-NEXT: s_add_pc_i64 test_kernel_1.rank_0_2_3_4_5_6_7@rel64
245243
; CHECK-NEXT: .LBB5_11: ; %bb.rank_0_2_3_4_5_6_7
246244
; CHECK-NEXT: s_add_gpr_idx_u32 idx0, max(128, dummy_store.num_vgpr)
247245
; CHECK-NEXT: s_cmp_eq_u32 s9, 5
248246
; CHECK-NEXT: s_cbranch_scc0 .LBB5_13
249247
; CHECK-NEXT: ; %bb.12:
250-
; CHECK-NEXT: s_set_pc_i64 s[10:11]
248+
; CHECK-NEXT: s_add_pc_i64 test_kernel_1.rank_0_2_3_4_5_6_7@rel64
251249
; CHECK-NEXT: .LBB5_13: ; %bb.rank_0_2_3_4_5_6_7
252250
; CHECK-NEXT: s_add_gpr_idx_u32 idx0, max(128, dummy_store.num_vgpr)
253251
; CHECK-NEXT: s_cmp_eq_u32 s9, 6
254252
; CHECK-NEXT: s_cbranch_scc0 .LBB5_15
255253
; CHECK-NEXT: ; %bb.14:
256-
; CHECK-NEXT: s_set_pc_i64 s[10:11]
254+
; CHECK-NEXT: s_add_pc_i64 test_kernel_1.rank_0_2_3_4_5_6_7@rel64
257255
; CHECK-NEXT: .LBB5_15: ; %bb.rank_0_2_3_4_5_6_7
258256
; CHECK-NEXT: s_add_gpr_idx_u32 idx0, max(128, dummy_store.num_vgpr)
259257
; CHECK-NEXT: s_cmp_eq_u32 s9, 7
260258
; CHECK-NEXT: s_cbranch_scc0 .LBB5_17
261259
; CHECK-NEXT: ; %bb.16:
262-
; CHECK-NEXT: s_set_pc_i64 s[10:11]
260+
; CHECK-NEXT: s_add_pc_i64 test_kernel_1.rank_0_2_3_4_5_6_7@rel64
263261
; CHECK-NEXT: .LBB5_17: ; %bb.rank_0_2_3_4_5_6_7
264262
; CHECK-NEXT: s_add_gpr_idx_u32 idx0, max(128, dummy_store.num_vgpr)
265263
; CHECK-NEXT: s_cbranch_execnz .LBB5_2
@@ -268,10 +266,7 @@ define dso_local amdgpu_kernel void @test_kernel_1() local_unnamed_addr #1 !reqd
268266
; CHECK-NEXT: s_cmp_eq_u32 s9, 1
269267
; CHECK-NEXT: s_cbranch_scc0 .LBB5_20
270268
; CHECK-NEXT: ; %bb.19:
271-
; CHECK-NEXT: s_get_pc_i64 s[10:11]
272-
; CHECK-NEXT: s_add_nc_u64 s[10:11], s[10:11], test_kernel_1.rank_1@rel64+4
273-
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
274-
; CHECK-NEXT: s_set_pc_i64 s[10:11]
269+
; CHECK-NEXT: s_add_pc_i64 test_kernel_1.rank_1@rel64
275270
; CHECK-NEXT: .LBB5_20: ; %bb.rank_1
276271
; CHECK-NEXT: s_add_gpr_idx_u32 idx0, max(128, dummy_store.num_vgpr, dummy_rank1a.num_vgpr, dummy_rank1b.num_vgpr)
277272
; CHECK-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/wavegroup-rank-func.ll

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -143,28 +143,19 @@ define amdgpu_kernel void @main(ptr addrspace(1) %inbuf, ptr addrspace(1) %wbuf,
143143
; CHECK-NEXT: s_cmp_eq_u32 s3, 0
144144
; CHECK-NEXT: s_cbranch_scc0 .LBB3_2
145145
; CHECK-NEXT: ; %bb.1:
146-
; CHECK-NEXT: s_get_pc_i64 s[4:5]
147-
; CHECK-NEXT: s_add_nc_u64 s[4:5], s[4:5], input@rel64+4
148-
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
149-
; CHECK-NEXT: s_set_pc_i64 s[4:5]
146+
; CHECK-NEXT: s_add_pc_i64 input@rel64
150147
; CHECK-NEXT: .LBB3_2: ; %entry
151148
; CHECK-NEXT: s_add_gpr_idx_u32 idx0, 18
152149
; CHECK-NEXT: s_cmp_eq_u32 s3, 1
153150
; CHECK-NEXT: s_cbranch_scc0 .LBB3_4
154151
; CHECK-NEXT: ; %bb.3:
155-
; CHECK-NEXT: s_get_pc_i64 s[4:5]
156-
; CHECK-NEXT: s_add_nc_u64 s[4:5], s[4:5], compute@rel64+4
157-
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
158-
; CHECK-NEXT: s_set_pc_i64 s[4:5]
152+
; CHECK-NEXT: s_add_pc_i64 compute@rel64
159153
; CHECK-NEXT: .LBB3_4: ; %entry
160154
; CHECK-NEXT: s_add_gpr_idx_u32 idx0, 0
161155
; CHECK-NEXT: s_cmp_eq_u32 s3, 2
162156
; CHECK-NEXT: s_cbranch_scc0 .LBB3_6
163157
; CHECK-NEXT: ; %bb.5:
164-
; CHECK-NEXT: s_get_pc_i64 s[4:5]
165-
; CHECK-NEXT: s_add_nc_u64 s[4:5], s[4:5], output@rel64+4
166-
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
167-
; CHECK-NEXT: s_set_pc_i64 s[4:5]
158+
; CHECK-NEXT: s_add_pc_i64 output@rel64
168159
; CHECK-NEXT: .LBB3_6: ; %entry
169160
; CHECK-NEXT: s_add_gpr_idx_u32 idx0, 1
170161
; CHECK-NEXT: s_endpgm

0 commit comments

Comments
 (0)