Skip to content

Commit 2e2b216

Browse files
committed
[AMDGPU] Add liverange split instructions into BB Prolog
The COPY inserted for a liverange split during the sgpr-regalloc pipeline currently breaks the BB prolog during the subsequent vgpr-regalloc phase while spilling and/or splitting the vector liveranges. This patch fixes it by correctly including the LR split instructions inserted during the sgpr-regalloc and wwm-regalloc pipelines in the BB prolog.
1 parent 90f830f commit 2e2b216

File tree

4 files changed

+148
-132
lines changed

4 files changed

+148
-132
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8956,6 +8956,30 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
89568956
return AMDGPU::COPY;
89578957
}
89588958

8959+
bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
8960+
uint16_t Opcode = MI.getOpcode();
8961+
// Check if it is SGPR spill or wwm-register spill Opcode.
8962+
if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
8963+
return true;
8964+
8965+
const MachineFunction *MF = MI.getMF();
8966+
const MachineRegisterInfo &MRI = MF->getRegInfo();
8967+
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
8968+
8969+
// See if this is Liverange split instruction inserted for SGPR or
8970+
// wwm-register. The implicit def inserted for wwm-registers should also be
8971+
// included as they can appear at the bb begin.
8972+
bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
8973+
if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
8974+
return false;
8975+
8976+
Register Reg = MI.getOperand(0).getReg();
8977+
if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
8978+
return IsLRSplitInst;
8979+
8980+
return MFI->isWWMReg(Reg);
8981+
}
8982+
89598983
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
89608984
Register Reg) const {
89618985
// We need to handle instructions which may be inserted during register
@@ -8964,20 +8988,16 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
89648988
// needed by the prolog. However, the insertions for scalar registers can
89658989
// always be placed at the BB top as they are independent of the exec mask
89668990
// value.
8967-
const MachineFunction *MF = MI.getParent()->getParent();
89688991
bool IsNullOrVectorRegister = true;
89698992
if (Reg) {
8993+
const MachineFunction *MF = MI.getMF();
89708994
const MachineRegisterInfo &MRI = MF->getRegInfo();
89718995
IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
89728996
}
89738997

8974-
uint16_t Opcode = MI.getOpcode();
8975-
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
89768998
return IsNullOrVectorRegister &&
8977-
(isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
8978-
(Opcode == AMDGPU::IMPLICIT_DEF &&
8979-
MFI->isWWMReg(MI.getOperand(0).getReg())) ||
8980-
(!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8999+
(canAddToBBProlog(MI) ||
9000+
(!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
89819001
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
89829002
}
89839003

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,6 +1348,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
13481348
bool isBasicBlockPrologue(const MachineInstr &MI,
13491349
Register Reg = Register()) const override;
13501350

1351+
bool canAddToBBProlog(const MachineInstr &MI) const;
1352+
13511353
MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
13521354
MachineBasicBlock::iterator InsPt,
13531355
const DebugLoc &DL, Register Src,

llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll

Lines changed: 57 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -176,39 +176,39 @@ define void @main(i1 %arg) #0 {
176176
; CHECK-NEXT: v_readlane_b32 s17, v7, 37
177177
; CHECK-NEXT: v_readlane_b32 s18, v7, 38
178178
; CHECK-NEXT: v_readlane_b32 s19, v7, 39
179-
; CHECK-NEXT: v_writelane_b32 v7, s4, 40
180-
; CHECK-NEXT: v_writelane_b32 v7, s5, 41
181-
; CHECK-NEXT: v_writelane_b32 v7, s6, 42
182-
; CHECK-NEXT: v_writelane_b32 v7, s7, 43
183-
; CHECK-NEXT: v_writelane_b32 v7, s8, 44
184-
; CHECK-NEXT: v_writelane_b32 v7, s9, 45
185-
; CHECK-NEXT: v_writelane_b32 v7, s10, 46
186-
; CHECK-NEXT: v_writelane_b32 v7, s11, 47
187-
; CHECK-NEXT: v_writelane_b32 v7, s12, 48
188-
; CHECK-NEXT: v_writelane_b32 v7, s13, 49
189-
; CHECK-NEXT: v_writelane_b32 v7, s14, 50
190-
; CHECK-NEXT: v_writelane_b32 v7, s15, 51
191-
; CHECK-NEXT: v_writelane_b32 v7, s16, 52
192-
; CHECK-NEXT: v_writelane_b32 v7, s17, 53
193-
; CHECK-NEXT: v_writelane_b32 v7, s18, 54
194-
; CHECK-NEXT: v_writelane_b32 v7, s19, 55
179+
; CHECK-NEXT: v_writelane_b32 v7, s4, 56
180+
; CHECK-NEXT: v_writelane_b32 v7, s5, 57
181+
; CHECK-NEXT: v_writelane_b32 v7, s6, 58
182+
; CHECK-NEXT: v_writelane_b32 v7, s7, 59
183+
; CHECK-NEXT: v_writelane_b32 v7, s8, 60
184+
; CHECK-NEXT: v_writelane_b32 v7, s9, 61
185+
; CHECK-NEXT: v_writelane_b32 v7, s10, 62
186+
; CHECK-NEXT: v_writelane_b32 v7, s11, 63
187+
; CHECK-NEXT: v_writelane_b32 v7, s52, 40
188+
; CHECK-NEXT: v_writelane_b32 v7, s53, 41
189+
; CHECK-NEXT: v_writelane_b32 v7, s54, 42
190+
; CHECK-NEXT: v_writelane_b32 v7, s55, 43
191+
; CHECK-NEXT: v_writelane_b32 v7, s56, 44
192+
; CHECK-NEXT: v_writelane_b32 v7, s57, 45
193+
; CHECK-NEXT: v_writelane_b32 v7, s58, 46
195194
; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
196-
; CHECK-NEXT: v_writelane_b32 v7, s52, 56
197-
; CHECK-NEXT: v_writelane_b32 v6, s60, 0
198-
; CHECK-NEXT: v_writelane_b32 v7, s53, 57
199-
; CHECK-NEXT: v_writelane_b32 v6, s61, 1
200-
; CHECK-NEXT: v_writelane_b32 v7, s54, 58
201-
; CHECK-NEXT: v_writelane_b32 v6, s62, 2
202-
; CHECK-NEXT: v_writelane_b32 v7, s55, 59
203-
; CHECK-NEXT: v_writelane_b32 v6, s63, 3
204-
; CHECK-NEXT: v_writelane_b32 v7, s56, 60
205-
; CHECK-NEXT: v_writelane_b32 v6, s64, 4
206-
; CHECK-NEXT: v_writelane_b32 v7, s57, 61
207-
; CHECK-NEXT: v_writelane_b32 v6, s65, 5
208-
; CHECK-NEXT: v_writelane_b32 v7, s58, 62
209-
; CHECK-NEXT: v_writelane_b32 v6, s66, 6
210-
; CHECK-NEXT: v_writelane_b32 v7, s59, 63
211-
; CHECK-NEXT: v_writelane_b32 v6, s67, 7
195+
; CHECK-NEXT: v_writelane_b32 v7, s59, 47
196+
; CHECK-NEXT: v_writelane_b32 v6, s12, 0
197+
; CHECK-NEXT: v_writelane_b32 v7, s60, 48
198+
; CHECK-NEXT: v_writelane_b32 v6, s13, 1
199+
; CHECK-NEXT: v_writelane_b32 v7, s61, 49
200+
; CHECK-NEXT: v_writelane_b32 v6, s14, 2
201+
; CHECK-NEXT: v_writelane_b32 v7, s62, 50
202+
; CHECK-NEXT: v_writelane_b32 v6, s15, 3
203+
; CHECK-NEXT: v_writelane_b32 v7, s63, 51
204+
; CHECK-NEXT: v_writelane_b32 v6, s16, 4
205+
; CHECK-NEXT: v_writelane_b32 v7, s64, 52
206+
; CHECK-NEXT: v_writelane_b32 v6, s17, 5
207+
; CHECK-NEXT: v_writelane_b32 v7, s65, 53
208+
; CHECK-NEXT: v_writelane_b32 v6, s18, 6
209+
; CHECK-NEXT: v_writelane_b32 v7, s66, 54
210+
; CHECK-NEXT: v_writelane_b32 v6, s19, 7
211+
; CHECK-NEXT: v_writelane_b32 v7, s67, 55
212212
; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[26:27]
213213
; CHECK-NEXT: s_cbranch_execz .LBB0_10
214214
; CHECK-NEXT: ; %bb.4: ; %bb32
@@ -264,62 +264,60 @@ define void @main(i1 %arg) #0 {
264264
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
265265
; CHECK-NEXT: ; implicit-def: $vgpr0
266266
; CHECK-NEXT: .LBB0_6: ; %Flow12
267-
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[22:23]
268-
; CHECK-NEXT: v_readlane_b32 s52, v7, 40
269-
; CHECK-NEXT: v_readlane_b32 s53, v7, 41
270-
; CHECK-NEXT: v_readlane_b32 s54, v7, 42
271-
; CHECK-NEXT: v_readlane_b32 s55, v7, 43
272-
; CHECK-NEXT: v_readlane_b32 s56, v7, 44
273-
; CHECK-NEXT: v_readlane_b32 s57, v7, 45
274-
; CHECK-NEXT: v_readlane_b32 s58, v7, 46
275-
; CHECK-NEXT: v_readlane_b32 s59, v7, 47
276-
; CHECK-NEXT: v_readlane_b32 s60, v7, 48
277-
; CHECK-NEXT: v_readlane_b32 s61, v7, 49
278-
; CHECK-NEXT: v_readlane_b32 s62, v7, 50
279-
; CHECK-NEXT: v_readlane_b32 s63, v7, 51
280-
; CHECK-NEXT: v_readlane_b32 s64, v7, 52
281-
; CHECK-NEXT: v_readlane_b32 s65, v7, 53
282-
; CHECK-NEXT: v_readlane_b32 s66, v7, 54
283-
; CHECK-NEXT: v_readlane_b32 s67, v7, 55
284-
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
267+
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[22:23]
285268
; CHECK-NEXT: s_cbranch_execz .LBB0_9
286269
; CHECK-NEXT: ; %bb.7: ; %bb33.preheader
287270
; CHECK-NEXT: s_mov_b32 s8, 0
288271
; CHECK-NEXT: s_mov_b32 s6, s8
272+
; CHECK-NEXT: v_readlane_b32 s36, v7, 40
289273
; CHECK-NEXT: s_mov_b32 s7, s8
290274
; CHECK-NEXT: v_mov_b32_e32 v1, s6
291-
; CHECK-NEXT: v_readlane_b32 s36, v7, 56
275+
; CHECK-NEXT: v_readlane_b32 s37, v7, 41
292276
; CHECK-NEXT: s_mov_b32 s9, s8
293277
; CHECK-NEXT: s_mov_b32 s10, s8
294278
; CHECK-NEXT: s_mov_b32 s11, s8
295279
; CHECK-NEXT: v_mov_b32_e32 v2, s7
280+
; CHECK-NEXT: v_readlane_b32 s38, v7, 42
281+
; CHECK-NEXT: v_readlane_b32 s39, v7, 43
282+
; CHECK-NEXT: v_readlane_b32 s40, v7, 44
283+
; CHECK-NEXT: v_readlane_b32 s41, v7, 45
284+
; CHECK-NEXT: v_readlane_b32 s42, v7, 46
285+
; CHECK-NEXT: v_readlane_b32 s43, v7, 47
286+
; CHECK-NEXT: v_readlane_b32 s44, v7, 48
287+
; CHECK-NEXT: v_readlane_b32 s45, v7, 49
288+
; CHECK-NEXT: v_readlane_b32 s46, v7, 50
289+
; CHECK-NEXT: v_readlane_b32 s47, v7, 51
290+
; CHECK-NEXT: v_readlane_b32 s48, v7, 52
291+
; CHECK-NEXT: v_readlane_b32 s49, v7, 53
292+
; CHECK-NEXT: v_readlane_b32 s50, v7, 54
293+
; CHECK-NEXT: v_readlane_b32 s51, v7, 55
294+
; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37]
295+
; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39]
296+
; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41]
297+
; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43]
298+
; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1
299+
; CHECK-NEXT: v_readlane_b32 s36, v7, 56
296300
; CHECK-NEXT: v_readlane_b32 s37, v7, 57
297301
; CHECK-NEXT: v_readlane_b32 s38, v7, 58
298302
; CHECK-NEXT: v_readlane_b32 s39, v7, 59
299303
; CHECK-NEXT: v_readlane_b32 s40, v7, 60
300304
; CHECK-NEXT: v_readlane_b32 s41, v7, 61
301305
; CHECK-NEXT: v_readlane_b32 s42, v7, 62
302306
; CHECK-NEXT: v_readlane_b32 s43, v7, 63
303-
; CHECK-NEXT: s_nop 4
304-
; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1
305-
; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1
306307
; CHECK-NEXT: ; kill: killed $vgpr1_vgpr2
307-
; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37]
308308
; CHECK-NEXT: s_and_b64 vcc, exec, 0
309309
; CHECK-NEXT: v_readlane_b32 s44, v6, 0
310310
; CHECK-NEXT: v_readlane_b32 s45, v6, 1
311311
; CHECK-NEXT: v_readlane_b32 s46, v6, 2
312312
; CHECK-NEXT: v_readlane_b32 s47, v6, 3
313+
; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[8:11] dmask:0x1
313314
; CHECK-NEXT: v_readlane_b32 s48, v6, 4
314315
; CHECK-NEXT: v_readlane_b32 s49, v6, 5
315316
; CHECK-NEXT: v_readlane_b32 s50, v6, 6
316317
; CHECK-NEXT: v_readlane_b32 s51, v6, 7
317-
; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39]
318-
; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41]
319-
; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43]
320318
; CHECK-NEXT: ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
321319
; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
322-
; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
320+
; CHECK-NEXT: ; kill: killed $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43
323321
; CHECK-NEXT: s_waitcnt vmcnt(0)
324322
; CHECK-NEXT: v_sub_f32_e32 v1, v4, v3
325323
; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0

0 commit comments

Comments
 (0)