Skip to content

Commit ed38d67

Browse files
authored
PeepholeOpt: Handle subregister compose when looking through reg_sequence (llvm#127051)
Previously this would give up on folding subregister copies through a reg_sequence if the input operand already had a subregister index. d246cc6 stopped introducing these subregister uses, and this is the first step toward lifting that restriction. I was expecting to be able to implement this purely with compose / reverse compose, but I wasn't able to make it work, so this relies on testing the lanemasks to check whether the copy reads a subset of the input.
1 parent 8fe290e commit ed38d67

File tree

4 files changed

+94
-61
lines changed

4 files changed

+94
-61
lines changed

llvm/lib/CodeGen/PeepholeOptimizer.cpp

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1984,12 +1984,43 @@ ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() {
19841984

19851985
// We are looking at:
19861986
// Def = REG_SEQUENCE v0, sub0, v1, sub1, ...
1987-
// Check if one of the operand defines the subreg we are interested in.
1987+
//
1988+
// Check if one of the operands exactly defines the subreg we are interested
1989+
// in.
19881990
for (const RegSubRegPairAndIdx &RegSeqInput : RegSeqInputRegs) {
19891991
if (RegSeqInput.SubIdx == DefSubReg)
19901992
return ValueTrackerResult(RegSeqInput.Reg, RegSeqInput.SubReg);
19911993
}
19921994

1995+
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
1996+
1997+
// If we did not find an exact match, see if we can do a composition to
1998+
// extract a sub-subregister.
1999+
for (const RegSubRegPairAndIdx &RegSeqInput : RegSeqInputRegs) {
2000+
// We don't check if the resulting class supports the subregister index
2001+
// yet. This will occur before any rewrite when looking for an eligible
2002+
// source.
2003+
2004+
LaneBitmask DefMask = TRI->getSubRegIndexLaneMask(DefSubReg);
2005+
LaneBitmask ThisOpRegMask = TRI->getSubRegIndexLaneMask(RegSeqInput.SubIdx);
2006+
2007+
// Check that this extract reads a subset of this single reg_sequence input.
2008+
//
2009+
// FIXME: We should be able to filter this in terms of the indexes directly
2010+
// without checking the lanemasks.
2011+
if ((DefMask & ThisOpRegMask) != DefMask)
2012+
continue;
2013+
2014+
unsigned ReverseDefCompose =
2015+
TRI->reverseComposeSubRegIndices(RegSeqInput.SubIdx, DefSubReg);
2016+
if (!ReverseDefCompose)
2017+
continue;
2018+
2019+
unsigned ComposedDefInSrcReg1 =
2020+
TRI->composeSubRegIndices(RegSeqInput.SubReg, ReverseDefCompose);
2021+
return ValueTrackerResult(RegSeqInput.Reg, ComposedDefInSrcReg1);
2022+
}
2023+
19932024
// If the subreg we are tracking is super-defined by another subreg,
19942025
// we could follow this value. However, this would require to compose
19952026
// the subreg and we do not do that for now.

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2872,8 +2872,8 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
28722872
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
28732873
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
28742874
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14
2875-
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
2876-
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
2875+
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
2876+
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v1, vcc
28772877
; GCN-NEXT: s_setpc_b64 s[30:31]
28782878
;
28792879
; GFX10-LABEL: dyn_extract_v7f64_v_v:
@@ -2898,8 +2898,8 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
28982898
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
28992899
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
29002900
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v14
2901-
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
2902-
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
2901+
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc_lo
2902+
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v1, vcc_lo
29032903
; GFX10-NEXT: s_setpc_b64 s[30:31]
29042904
;
29052905
; GFX11-LABEL: dyn_extract_v7f64_v_v:
@@ -2918,7 +2918,7 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
29182918
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14
29192919
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13
29202920
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v14
2921-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
2921+
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v0 :: v_dual_cndmask_b32 v1, v1, v1
29222922
; GFX11-NEXT: s_setpc_b64 s[30:31]
29232923
entry:
29242924
%ext = extractelement <7 x double> %vec, i32 %sel

llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll

Lines changed: 31 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,9 @@ define void @issue92561(ptr addrspace(1) %arg) {
7979
; GISEL: ; %bb.0: ; %bb
8080
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8181
; GISEL-NEXT: s_clause 0x1
82-
; GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off
83-
; GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16
84-
; GISEL-NEXT: v_mov_b32_e32 v0, 0
82+
; GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off
83+
; GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
84+
; GISEL-NEXT: v_mov_b32_e32 v8, 0
8585
; GISEL-NEXT: s_mov_b32 s20, 0
8686
; GISEL-NEXT: s_mov_b32 s3, exec_lo
8787
; GISEL-NEXT: s_mov_b32 s21, s20
@@ -97,49 +97,51 @@ define void @issue92561(ptr addrspace(1) %arg) {
9797
; GISEL-NEXT: s_mov_b32 s11, s20
9898
; GISEL-NEXT: s_waitcnt vmcnt(0)
9999
; GISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
100-
; GISEL-NEXT: v_readfirstlane_b32 s12, v2
101-
; GISEL-NEXT: v_readfirstlane_b32 s13, v3
102-
; GISEL-NEXT: v_readfirstlane_b32 s14, v4
103-
; GISEL-NEXT: v_readfirstlane_b32 s15, v5
104-
; GISEL-NEXT: v_readfirstlane_b32 s16, v6
105-
; GISEL-NEXT: v_readfirstlane_b32 s17, v7
106-
; GISEL-NEXT: v_readfirstlane_b32 s18, v8
107-
; GISEL-NEXT: v_readfirstlane_b32 s19, v9
108-
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[12:13], v[2:3]
109-
; GISEL-NEXT: v_cmp_eq_u64_e64 s0, s[14:15], v[4:5]
110-
; GISEL-NEXT: v_cmp_eq_u64_e64 s1, s[16:17], v[6:7]
100+
; GISEL-NEXT: v_readfirstlane_b32 s12, v4
101+
; GISEL-NEXT: v_readfirstlane_b32 s13, v5
102+
; GISEL-NEXT: v_readfirstlane_b32 s14, v6
103+
; GISEL-NEXT: v_readfirstlane_b32 s15, v7
104+
; GISEL-NEXT: v_readfirstlane_b32 s16, v0
105+
; GISEL-NEXT: v_readfirstlane_b32 s17, v1
106+
; GISEL-NEXT: v_readfirstlane_b32 s18, v2
107+
; GISEL-NEXT: v_readfirstlane_b32 s19, v3
108+
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[12:13], v[4:5]
109+
; GISEL-NEXT: v_cmp_eq_u64_e64 s0, s[14:15], v[6:7]
110+
; GISEL-NEXT: v_cmp_eq_u64_e64 s1, s[16:17], v[0:1]
111111
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
112-
; GISEL-NEXT: v_cmp_eq_u64_e64 s2, s[18:19], v[8:9]
112+
; GISEL-NEXT: v_cmp_eq_u64_e64 s2, s[18:19], v[2:3]
113113
; GISEL-NEXT: s_and_b32 s0, vcc_lo, s0
114114
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
115115
; GISEL-NEXT: s_and_b32 s0, s0, s1
116116
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
117117
; GISEL-NEXT: s_and_b32 s0, s0, s2
118118
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
119119
; GISEL-NEXT: s_and_saveexec_b32 s0, s0
120-
; GISEL-NEXT: image_sample_c_lz v1, [v0, v0, v0, v0], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
121-
; GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
122-
; GISEL-NEXT: ; implicit-def: $vgpr0
120+
; GISEL-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
121+
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
122+
; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
123+
; GISEL-NEXT: ; implicit-def: $vgpr8
123124
; GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
124125
; GISEL-NEXT: s_cbranch_execnz .LBB0_1
125126
; GISEL-NEXT: ; %bb.2:
126127
; GISEL-NEXT: s_mov_b32 exec_lo, s3
127-
; GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1.0
128-
; GISEL-NEXT: v_mov_b32_e32 v0, 0x7fc00000
128+
; GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0x7fc00000
129+
; GISEL-NEXT: v_mov_b32_e32 v2, 1.0
129130
; GISEL-NEXT: s_clause 0x2
130-
; GISEL-NEXT: image_sample_c_lz v0, [v2, v2, v0, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
131-
; GISEL-NEXT: image_sample_c_lz v3, [v2, v3, v2, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
132-
; GISEL-NEXT: image_sample_c_lz v4, [v2, v2, v2, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
131+
; GISEL-NEXT: image_sample_c_lz v0, [v1, v1, v0, v1], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
132+
; GISEL-NEXT: image_sample_c_lz v2, [v1, v2, v1, v1], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
133+
; GISEL-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
133134
; GISEL-NEXT: s_waitcnt vmcnt(2)
134-
; GISEL-NEXT: v_add_f32_e32 v0, v1, v0
135+
; GISEL-NEXT: v_add_f32_e32 v0, v9, v0
135136
; GISEL-NEXT: s_waitcnt vmcnt(1)
136-
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
137-
; GISEL-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, v2
137+
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
138+
; GISEL-NEXT: v_add_f32_e32 v0, v2, v0
139+
; GISEL-NEXT: v_mov_b32_e32 v2, v1
138140
; GISEL-NEXT: s_waitcnt vmcnt(0)
139-
; GISEL-NEXT: v_add_f32_e32 v0, v4, v0
141+
; GISEL-NEXT: v_add_f32_e32 v0, v3, v0
140142
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
141-
; GISEL-NEXT: v_mul_f32_e32 v1, 0x3e800000, v0
142-
; GISEL-NEXT: image_store v[1:3], [v2, v2], s[4:11] dim:SQ_RSRC_IMG_2D unorm
143+
; GISEL-NEXT: v_mul_f32_e32 v0, 0x3e800000, v0
144+
; GISEL-NEXT: image_store v[0:2], [v1, v1], s[4:11] dim:SQ_RSRC_IMG_2D unorm
143145
; GISEL-NEXT: s_setpc_b64 s[30:31]
144146
bb:
145147
%descriptor = load <8 x i32>, ptr addrspace(1) %arg, align 32

0 commit comments

Comments
 (0)