Skip to content

Commit fa050ea

Browse files
authored
Reland: CodeGen: Record MMOs in finalizeBundle (#166689)
(original PR: #166210) This allows more accurate alias analysis to apply at the bundle level. This has a bunch of minor effects in post-RA scheduling that look mostly beneficial to me, all of them in AMDGPU (the Thumb2 change is cosmetic). The pre-existing (and unchanged) test in CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll tests that MIR with a bundle with MMOs can be parsed successfully. v2: - use cloneMergedMemRefs - add another test to explicitly check the MMO bundling behavior v3: - use poison instead of undef to initialize the global variable in the test v4: - treat bundle memory accesses as never trivially disjoint
1 parent 28c6ed5 commit fa050ea

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+8011
-8602
lines changed

llvm/lib/CodeGen/MIRParser/MIParser.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) {
11611161
MemOperands.push_back(MemOp);
11621162
if (Token.isNewlineOrEOF())
11631163
break;
1164+
if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace))
1165+
break;
11641166
if (Token.isNot(MIToken::comma))
11651167
return error("expected ',' before the next machine memory operand");
11661168
lex();

llvm/lib/CodeGen/MachineInstrBundle.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
143143
SmallSet<Register, 8> KilledUseSet;
144144
SmallSet<Register, 8> UndefUseSet;
145145
SmallVector<std::pair<Register, Register>> TiedOperands;
146+
SmallVector<MachineInstr *> MemMIs;
146147
for (auto MII = FirstMI; MII != LastMI; ++MII) {
147148
// Debug instructions have no effects to track.
148149
if (MII->isDebugInstr())
@@ -206,6 +207,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
206207
MIB.setMIFlag(MachineInstr::FrameSetup);
207208
if (MII->getFlag(MachineInstr::FrameDestroy))
208209
MIB.setMIFlag(MachineInstr::FrameDestroy);
210+
211+
if (MII->mayLoadOrStore())
212+
MemMIs.push_back(&*MII);
209213
}
210214

211215
for (Register Reg : LocalDefs) {
@@ -231,6 +235,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
231235
assert(UseIdx < ExternUses.size());
232236
MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx);
233237
}
238+
239+
MIB->cloneMergedMemRefs(MF, MemMIs);
234240
}
235241

236242
/// finalizeBundle - Same functionality as the previous finalizeBundle except

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3917,6 +3917,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
39173917
if (isLDSDMA(MIa) || isLDSDMA(MIb))
39183918
return false;
39193919

3920+
if (MIa.isBundle() || MIb.isBundle())
3921+
return false;
3922+
39203923
// TODO: Should we check the address space from the MachineMemOperand? That
39213924
// would allow us to distinguish objects we know don't alias based on the
39223925
// underlying address space, even if it was lowered to a different one,

llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
189189
; GFX10-NEXT: v_mov_b32_e32 v2, s1
190190
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
191191
; GFX10-NEXT: v_mov_b32_e32 v4, s4
192-
; GFX10-NEXT: s_lshr_b32 s1, s1, 24
193192
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
194-
; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
195193
; GFX10-NEXT: s_lshr_b32 s5, s5, 8
196194
; GFX10-NEXT: v_mov_b32_e32 v5, s0
197195
; GFX10-NEXT: s_lshr_b32 s0, s7, 8
198196
; GFX10-NEXT: v_mov_b32_e32 v6, s6
199-
; GFX10-NEXT: v_mov_b32_e32 v7, s1
200-
; GFX10-NEXT: s_lshr_b32 s1, s9, 8
201197
; GFX10-NEXT: v_mov_b32_e32 v8, s5
202198
; GFX10-NEXT: v_mov_b32_e32 v9, s0
203199
; GFX10-NEXT: ds_write_b8 v1, v0
@@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
208204
; GFX10-NEXT: ds_write_b8 v1, v8 offset:1
209205
; GFX10-NEXT: ds_write_b8 v1, v9 offset:5
210206
; GFX10-NEXT: v_mov_b32_e32 v0, s8
211-
; GFX10-NEXT: v_mov_b32_e32 v3, s2
212-
; GFX10-NEXT: v_mov_b32_e32 v10, s1
207+
; GFX10-NEXT: s_lshr_b32 s1, s1, 24
208+
; GFX10-NEXT: s_and_b32 s9, 0xffff, s2
213209
; GFX10-NEXT: s_lshr_b32 s0, s2, 24
214-
; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
215-
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
216-
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
210+
; GFX10-NEXT: v_mov_b32_e32 v7, s1
211+
; GFX10-NEXT: s_lshr_b32 s1, s9, 8
212+
; GFX10-NEXT: v_mov_b32_e32 v3, s2
217213
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
218214
; GFX10-NEXT: v_mov_b32_e32 v0, s0
219215
; GFX10-NEXT: s_and_b32 s0, 0xffff, s3
220-
; GFX10-NEXT: s_lshr_b32 s1, s3, 16
216+
; GFX10-NEXT: v_mov_b32_e32 v10, s1
221217
; GFX10-NEXT: s_lshr_b32 s0, s0, 8
218+
; GFX10-NEXT: s_lshr_b32 s1, s3, 16
222219
; GFX10-NEXT: v_mov_b32_e32 v2, s3
220+
; GFX10-NEXT: ds_write_b8 v1, v7 offset:7
221+
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
222+
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
223223
; GFX10-NEXT: v_mov_b32_e32 v3, s0
224224
; GFX10-NEXT: s_lshr_b32 s0, s3, 24
225225
; GFX10-NEXT: v_mov_b32_e32 v4, s1

llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
272272
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
273273
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
274274
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
275-
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
276-
; GFX906-NEXT: s_nop 0
277-
; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
278-
; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
279275
; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
280276
; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
281277
; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
@@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
288284
; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
289285
; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
290286
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
287+
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
288+
; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
289+
; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
291290
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
292291
; GFX906-NEXT: s_cbranch_execz .LBB6_2
293292
; GFX906-NEXT: ; %bb.1: ; %bb.1

0 commit comments

Comments
 (0)