Skip to content

Commit f0090ba

Browse files
rampitec and jayfoad authored
[AMDGPU] Fold copies of constant physical registers into their uses (#154410)
Co-authored-by: Jay Foad <[email protected]>
Co-authored-by: Jay Foad <[email protected]>
1 parent 6db244a commit f0090ba

File tree

5 files changed

+475
-403
lines changed

5 files changed

+475
-403
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -710,7 +710,8 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
710710
// Verify the register is compatible with the operand.
711711
if (const TargetRegisterClass *OpRC =
712712
TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
713-
const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg());
713+
const TargetRegisterClass *NewRC =
714+
TRI->getRegClassForReg(*MRI, New->getReg());
714715
const TargetRegisterClass *ConstrainRC =
715716
TRI->findCommonRegClass(OpRC, Old.getSubReg(), NewRC, New->getSubReg());
716717
if (!ConstrainRC)
@@ -727,8 +728,12 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
727728
// 16-bit SGPRs instead of 32-bit ones.
728729
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
729730
Old.setSubReg(AMDGPU::NoSubRegister);
730-
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
731-
Old.setIsUndef(New->isUndef());
731+
if (New->getReg().isPhysical()) {
732+
Old.substPhysReg(New->getReg(), *TRI);
733+
} else {
734+
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
735+
Old.setIsUndef(New->isUndef());
736+
}
732737
return true;
733738
}
734739

@@ -1997,7 +2002,9 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
19972002
if (!FoldingImm && !OpToFold.isReg())
19982003
return false;
19992004

2000-
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
2005+
// Fold virtual registers and constant physical registers.
2006+
if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
2007+
!TRI->isConstantPhysReg(OpToFold.getReg()))
20012008
return false;
20022009

20032010
// Prevent folding operands backwards in the function. For example,

llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll

Lines changed: 13 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -27,20 +27,20 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
2727
;
2828
; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast:
2929
; GFX1250-GISEL: ; %bb.0:
30-
; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24
31-
; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
30+
; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
31+
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo
3232
; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
33-
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
3433
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
35-
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, -1
36-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
37-
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
34+
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, -1
35+
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36+
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
3837
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
39-
; GFX1250-GISEL-NEXT: s_cselect_b32 s0, 1, 0
40-
; GFX1250-GISEL-NEXT: s_and_b32 s0, 1, s0
41-
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
38+
; GFX1250-GISEL-NEXT: s_cselect_b32 s1, 1, 0
39+
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
40+
; GFX1250-GISEL-NEXT: s_and_b32 s1, 1, s1
4241
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
43-
; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
42+
; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
43+
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
4444
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1
4545
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
4646
; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
@@ -69,14 +69,13 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa
6969
;
7070
; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull:
7171
; GFX1250-GISEL: ; %bb.0:
72-
; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24
73-
; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
72+
; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
7473
; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
75-
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
74+
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo
7675
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
7776
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
7877
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
79-
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
78+
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
8079
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
8180
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
8281
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0

0 commit comments

Comments
 (0)