Skip to content

Commit 6da68f8

Browse files
committed
AMDGPU: Fix verifier error when waterfall call target is in AV register
This isn't an ideal fix. Technically, this code path is only an optimization, and correctness should not depend on taking it; the base path, where a copy is inserted, is still broken. The lit test changes are mostly regressions, to be fixed later.
1 parent 557be13 commit 6da68f8

File tree

6 files changed

+1052
-706
lines changed

6 files changed

+1052
-706
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8178,26 +8178,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
81788178
return;
81798179
}
81808180

8181-
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
8182-
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
8183-
// Instead of creating a copy where src and dst are the same register
8184-
// class, we just replace all uses of dst with src. These kinds of
8185-
// copies interfere with the heuristics MachineSink uses to decide
8186-
// whether or not to split a critical edge. Since the pass assumes
8187-
// that copies will end up as machine instructions and not be
8188-
// eliminated.
8189-
addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8181+
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
81908182
Register NewDstReg = Inst.getOperand(1).getReg();
8191-
MRI.replaceRegWith(DstReg, NewDstReg);
8192-
MRI.clearKillFlags(NewDstReg);
8193-
Inst.getOperand(0).setReg(DstReg);
8194-
Inst.eraseFromParent();
8195-
// Legalize t16 operand since replaceReg is called after addUsersToVALU
8196-
for (MachineOperand &MO :
8197-
make_early_inc_range(MRI.use_operands(NewDstReg))) {
8198-
legalizeOperandsVALUt16(*MO.getParent(), MRI);
8183+
const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8184+
if (const TargetRegisterClass *CommonRC =
8185+
RI.getCommonSubClass(NewDstRC, SrcRC)) {
8186+
// Instead of creating a copy where src and dst are the same register
8187+
// class, we just replace all uses of dst with src. These kinds of
8188+
// copies interfere with the heuristics MachineSink uses to decide
8189+
// whether or not to split a critical edge. Since the pass assumes
8190+
// that copies will end up as machine instructions and not be
8191+
// eliminated.
8192+
addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8193+
MRI.replaceRegWith(DstReg, NewDstReg);
8194+
MRI.clearKillFlags(NewDstReg);
8195+
Inst.getOperand(0).setReg(DstReg);
8196+
8197+
if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8198+
llvm_unreachable("failed to constrain register");
8199+
8200+
Inst.eraseFromParent();
8201+
// Legalize t16 operand since replaceReg is called after addUsersToVALU
8202+
for (MachineOperand &MO :
8203+
make_early_inc_range(MRI.use_operands(NewDstReg))) {
8204+
legalizeOperandsVALUt16(*MO.getParent(), MRI);
8205+
}
8206+
8207+
return;
81998208
}
8200-
return;
82018209
}
82028210

82038211
// If this is a v2s copy between 16bit and 32bit reg,

llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10733,15 +10733,16 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
1073310733
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
1073410734
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
1073510735
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
10736+
; GFX90A-NEXT: s_waitcnt vmcnt(1)
10737+
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1073610738
; GFX90A-NEXT: s_waitcnt vmcnt(0)
1073710739
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
1073810740
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
10739-
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1074010741
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
1074110742
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
10742-
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
10743+
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
1074310744
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
10744-
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
10745+
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
1074510746
; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi
1074610747
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
1074710748
; GFX90A-NEXT: ;;#ASMSTART
@@ -11000,15 +11001,16 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
1100011001
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
1100111002
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
1100211003
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
11004+
; GFX90A-NEXT: s_waitcnt vmcnt(1)
11005+
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1100311006
; GFX90A-NEXT: s_waitcnt vmcnt(0)
1100411007
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
1100511008
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
11006-
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1100711009
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
1100811010
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
11009-
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
11011+
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
1101011012
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
11011-
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
11013+
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
1101211014
; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi
1101311015
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
1101411016
; GFX90A-NEXT: ;;#ASMSTART
@@ -19023,15 +19025,16 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
1902319025
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
1902419026
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
1902519027
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
19028+
; GFX90A-NEXT: s_waitcnt vmcnt(1)
19029+
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1902619030
; GFX90A-NEXT: s_waitcnt vmcnt(0)
1902719031
; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
1902819032
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
19029-
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1903019033
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
1903119034
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
19032-
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
19035+
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
1903319036
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
19034-
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
19037+
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
1903519038
; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi
1903619039
; GFX90A-NEXT: ;;#ASMSTART
1903719040
; GFX90A-NEXT: ; use a[0:1]
@@ -19282,15 +19285,16 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
1928219285
; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen
1928319286
; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
1928419287
; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000
19288+
; GFX90A-NEXT: s_waitcnt vmcnt(1)
19289+
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1928519290
; GFX90A-NEXT: s_waitcnt vmcnt(0)
1928619291
; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
1928719292
; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
19288-
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
1928919293
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
1929019294
; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
19291-
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
19295+
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
1929219296
; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen
19293-
; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
19297+
; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
1929419298
; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi
1929519299
; GFX90A-NEXT: ;;#ASMSTART
1929619300
; GFX90A-NEXT: ; use a[0:1]

llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,26 +43,25 @@ define void @phi_with_alloca_and_divergent_copy_to_reg(ptr addrspace(5) %diverge
4343
; CHECK-LABEL: phi_with_alloca_and_divergent_copy_to_reg:
4444
; CHECK: ; %bb.0: ; %entry
4545
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46-
; CHECK-NEXT: s_lshr_b32 s6, s32, 6
4746
; CHECK-NEXT: v_mov_b32_e32 v7, v2
4847
; CHECK-NEXT: v_mov_b32_e32 v6, v1
4948
; CHECK-NEXT: s_mov_b64 s[4:5], 0
50-
; CHECK-NEXT: v_mov_b32_e32 v1, s6
49+
; CHECK-NEXT: v_lshrrev_b32_e64 v2, 6, s32
5150
; CHECK-NEXT: .LBB1_1: ; %loop
5251
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
53-
; CHECK-NEXT: v_add_u32_e32 v8, 1, v3
54-
; CHECK-NEXT: v_lshl_add_u32 v5, v3, 2, v1
55-
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v8
56-
; CHECK-NEXT: v_mov_b32_e32 v2, v1
57-
; CHECK-NEXT: v_mov_b32_e32 v1, v0
58-
; CHECK-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
52+
; CHECK-NEXT: v_mov_b32_e32 v1, v2
53+
; CHECK-NEXT: v_lshl_add_u32 v2, v3, 2, v1
54+
; CHECK-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
55+
; CHECK-NEXT: v_add_u32_e32 v2, 1, v3
56+
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 15, v2
5957
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6058
; CHECK-NEXT: v_mov_b32_e32 v3, v4
59+
; CHECK-NEXT: v_mov_b32_e32 v2, v0
6160
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
6261
; CHECK-NEXT: s_cbranch_execnz .LBB1_1
6362
; CHECK-NEXT: ; %bb.2: ; %done
6463
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
65-
; CHECK-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
64+
; CHECK-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
6665
; CHECK-NEXT: s_waitcnt vmcnt(0)
6766
; CHECK-NEXT: global_store_dword v[6:7], v0, off
6867
; CHECK-NEXT: s_waitcnt vmcnt(0)

0 commit comments

Comments
 (0)