Skip to content

Commit 304d2ff

Browse files
authored
CodeGen: Record MMOs in finalizeBundle (#166210)
This allows more accurate alias analysis to apply at the bundle level. This has a bunch of minor effects in post-RA scheduling that look mostly beneficial to me, all of them in AMDGPU (the Thumb2 change is cosmetic).

The pre-existing (and unchanged) test in CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll tests that MIR containing a bundle with MMOs can be parsed successfully.

v2:
- use cloneMergedMemRefs
- add another test to explicitly check the MMO bundling behavior

v3:
- use poison instead of undef to initialize the global variable in the test
1 parent 19a9de0 commit 304d2ff

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

53 files changed

+10762
-11345
lines changed

llvm/lib/CodeGen/MIRParser/MIParser.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) {
11611161
MemOperands.push_back(MemOp);
11621162
if (Token.isNewlineOrEOF())
11631163
break;
1164+
if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace))
1165+
break;
11641166
if (Token.isNot(MIToken::comma))
11651167
return error("expected ',' before the next machine memory operand");
11661168
lex();

llvm/lib/CodeGen/MachineInstrBundle.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
137137
SmallSet<Register, 8> KilledUseSet;
138138
SmallSet<Register, 8> UndefUseSet;
139139
SmallVector<std::pair<Register, Register>> TiedOperands;
140+
SmallVector<MachineInstr *> MemMIs;
140141
for (auto MII = FirstMI; MII != LastMI; ++MII) {
141142
// Debug instructions have no effects to track.
142143
if (MII->isDebugInstr())
@@ -200,6 +201,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
200201
MIB.setMIFlag(MachineInstr::FrameSetup);
201202
if (MII->getFlag(MachineInstr::FrameDestroy))
202203
MIB.setMIFlag(MachineInstr::FrameDestroy);
204+
205+
if (MII->mayLoadOrStore())
206+
MemMIs.push_back(&*MII);
203207
}
204208

205209
for (Register Reg : LocalDefs) {
@@ -225,6 +229,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
225229
assert(UseIdx < ExternUses.size());
226230
MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx);
227231
}
232+
233+
MIB->cloneMergedMemRefs(MF, MemMIs);
228234
}
229235

230236
/// finalizeBundle - Same functionality as the previous finalizeBundle except

llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
3333
; GCN-NEXT: v_mov_b32_e32 v13, s49
3434
; GCN-NEXT: v_mov_b32_e32 v14, s50
3535
; GCN-NEXT: v_mov_b32_e32 v15, s51
36-
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0
3736
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
3837
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4
3938
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
@@ -51,6 +50,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
5150
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56
5251
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60
5352
; GCN-NEXT: v_mov_b32_e32 v0, s52
53+
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0
5454
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
5555
; GCN-NEXT: v_mov_b32_e32 v0, s53
5656
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68

llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
189189
; GFX10-NEXT: v_mov_b32_e32 v2, s1
190190
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
191191
; GFX10-NEXT: v_mov_b32_e32 v4, s4
192-
; GFX10-NEXT: s_lshr_b32 s1, s1, 24
193192
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
194-
; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
195193
; GFX10-NEXT: s_lshr_b32 s5, s5, 8
196194
; GFX10-NEXT: v_mov_b32_e32 v5, s0
197195
; GFX10-NEXT: s_lshr_b32 s0, s7, 8
198196
; GFX10-NEXT: v_mov_b32_e32 v6, s6
199-
; GFX10-NEXT: v_mov_b32_e32 v7, s1
200-
; GFX10-NEXT: s_lshr_b32 s1, s9, 8
201197
; GFX10-NEXT: v_mov_b32_e32 v8, s5
202198
; GFX10-NEXT: v_mov_b32_e32 v9, s0
203199
; GFX10-NEXT: ds_write_b8 v1, v0
@@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
208204
; GFX10-NEXT: ds_write_b8 v1, v8 offset:1
209205
; GFX10-NEXT: ds_write_b8 v1, v9 offset:5
210206
; GFX10-NEXT: v_mov_b32_e32 v0, s8
211-
; GFX10-NEXT: v_mov_b32_e32 v3, s2
212-
; GFX10-NEXT: v_mov_b32_e32 v10, s1
207+
; GFX10-NEXT: s_lshr_b32 s1, s1, 24
208+
; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
213209
; GFX10-NEXT: s_lshr_b32 s0, s2, 24
214-
; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
215-
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
216-
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
210+
; GFX10-NEXT: v_mov_b32_e32 v7, s1
211+
; GFX10-NEXT: s_lshr_b32 s1, s9, 8
212+
; GFX10-NEXT: v_mov_b32_e32 v3, s2
217213
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
218214
; GFX10-NEXT: v_mov_b32_e32 v0, s0
219215
; GFX10-NEXT: s_and_b32 s0, 0xffff, s3
220-
; GFX10-NEXT: s_lshr_b32 s1, s3, 16
216+
; GFX10-NEXT: v_mov_b32_e32 v10, s1
221217
; GFX10-NEXT: s_lshr_b32 s0, s0, 8
218+
; GFX10-NEXT: s_lshr_b32 s1, s3, 16
222219
; GFX10-NEXT: v_mov_b32_e32 v2, s3
220+
; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
221+
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
222+
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
223223
; GFX10-NEXT: v_mov_b32_e32 v3, s0
224224
; GFX10-NEXT: s_lshr_b32 s0, s3, 24
225225
; GFX10-NEXT: v_mov_b32_e32 v4, s1

llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
272272
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
273273
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
274274
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
275-
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
276-
; GFX906-NEXT: s_nop 0
277-
; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
278-
; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
279275
; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
280276
; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
281277
; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
@@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
288284
; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
289285
; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
290286
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
287+
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
288+
; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
289+
; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
291290
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
292291
; GFX906-NEXT: s_cbranch_execz .LBB6_2
293292
; GFX906-NEXT: ; %bb.1: ; %bb.1

0 commit comments

Comments
 (0)