Skip to content

Commit df1f231

Browse files
committed
[AMDGPU] Improve isBasicBlockPrologue to only add necessary instructions
1 parent a2ba438 commit df1f231

18 files changed

+735
-829
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8901,6 +8901,22 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
89018901
return AMDGPU::COPY;
89028902
}
89038903

// Returns true when MI is a reload whose result is read by a later
// instruction in the same basic block for which isBasicBlockPrologue()
// returns true. A "reload" here is either an SGPR-spill instruction that may
// load or is SI_RESTORE_S32_FROM_VGPR, or a WWM-register spill opcode that may
// load. Returns false for every other instruction.
8904+
bool SIInstrInfo::isPrologueOperandReload(const MachineInstr &MI) const {
8905+
unsigned Opcode = MI.getOpcode();
8906+
if ((isSGPRSpill(MI) &&
8907+
(MI.mayLoad() || Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)) ||
8908+
(isWWMRegSpillOpcode(Opcode) && MI.mayLoad())) {
// Register written by the first def operand of the reload.
// NOTE(review): assumes every opcode matched above has at least one def --
// confirm.
8909+
Register Reg = MI.defs().begin()->getReg();
8910+
const MachineBasicBlock *MBB = MI.getParent();
// Scan starts at the instruction following MI and runs to the end of the
// block.
8911+
MachineBasicBlock::const_instr_iterator I(MI), E = MBB->instr_end();
8912+
while (++I != E) {
// isBasicBlockPrologue is called without a register argument, so its Reg
// parameter takes the default declared in the header. That function in
// turn calls isPrologueOperandReload, so this check can recurse on later
// instructions.
8913+
if (I->readsRegister(Reg, &RI) && isBasicBlockPrologue(*I))
8914+
return true;
8915+
}
8916+
}
8917+
return false;
8918+
}
8919+
89048920
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
89058921
Register Reg) const {
89068922
// We need to handle instructions which may be inserted during register
@@ -8917,8 +8933,7 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
89178933

89188934
uint16_t Opcode = MI.getOpcode();
89198935
return IsNullOrVectorRegister &&
8920-
(isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
8921-
Opcode == AMDGPU::IMPLICIT_DEF ||
8936+
(isPrologueOperandReload(MI) || Opcode == AMDGPU::IMPLICIT_DEF ||
89228937
(!MI.isTerminator() && Opcode != AMDGPU::COPY &&
89238938
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
89248939
}

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1342,6 +1342,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
13421342
bool isBasicBlockPrologue(const MachineInstr &MI,
13431343
Register Reg = Register()) const override;
13441344

1345+
bool isPrologueOperandReload(const MachineInstr &MI) const;
1346+
13451347
MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
13461348
MachineBasicBlock::iterator InsPt,
13471349
const DebugLoc &DL, Register Src,

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -68,9 +68,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
6868
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
6969
; CHECK-NEXT: s_mov_b32 exec_lo, s21
7070
; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
71-
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
72-
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
73-
; CHECK-NEXT: s_mov_b32 exec_lo, s21
7471
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
7572
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
7673
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -87,7 +84,10 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
8784
; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
8885
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
8986
; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
90-
; CHECK-NEXT: s_waitcnt vmcnt(0)
87+
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
88+
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
89+
; CHECK-NEXT: s_mov_b32 exec_lo, s21
90+
; CHECK-NEXT: s_waitcnt vmcnt(1)
9191
; CHECK-NEXT: v_readfirstlane_b32 s12, v7
9292
; CHECK-NEXT: v_readfirstlane_b32 s10, v6
9393
; CHECK-NEXT: v_readfirstlane_b32 s9, v5
@@ -104,6 +104,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
104104
; CHECK-NEXT: s_mov_b32 s17, s6
105105
; CHECK-NEXT: s_mov_b32 s18, s5
106106
; CHECK-NEXT: s_mov_b32 s19, s4
107+
; CHECK-NEXT: s_waitcnt vmcnt(0)
107108
; CHECK-NEXT: v_writelane_b32 v16, s12, 5
108109
; CHECK-NEXT: v_writelane_b32 v16, s13, 6
109110
; CHECK-NEXT: v_writelane_b32 v16, s14, 7
@@ -137,6 +138,8 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
137138
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
138139
; CHECK-NEXT: s_mov_b32 exec_lo, s21
139140
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
141+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
142+
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
140143
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
141144
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
142145
; CHECK-NEXT: s_mov_b32 exec_lo, s21
@@ -154,9 +157,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
154157
; CHECK-NEXT: v_readlane_b32 s17, v16, 1
155158
; CHECK-NEXT: v_readlane_b32 s18, v16, 2
156159
; CHECK-NEXT: v_readlane_b32 s19, v16, 3
157-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
158-
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
159-
; CHECK-NEXT: s_waitcnt vmcnt(0)
160160
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
161161
; CHECK-NEXT: s_waitcnt vmcnt(0)
162162
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill

llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

Lines changed: 29 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -67,21 +67,20 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
6767
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
6868
; GCN-O0-NEXT: s_cbranch_execz .LBB0_4
6969
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
70+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
7071
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
7172
; GCN-O0-NEXT: s_waitcnt expcnt(0)
7273
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
7374
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
7475
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
7576
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 0
7677
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 1
77-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
7878
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
7979
; GCN-O0-NEXT: s_mov_b32 s0, 0
8080
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
8181
; GCN-O0-NEXT: s_mov_b32 s1, s2
8282
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
8383
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
84-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
8584
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
8685
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
8786
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
@@ -100,16 +99,16 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
10099
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
101100
; GCN-O0-NEXT: s_cbranch_execz .LBB0_3
102101
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
102+
; GCN-O0-NEXT: s_waitcnt expcnt(1)
103+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
103104
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
104105
; GCN-O0-NEXT: s_waitcnt expcnt(0)
105106
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
106107
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
107108
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
108109
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
109110
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
110-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
111111
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
112-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
113112
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
114113
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
115114
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -236,21 +235,20 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
236235
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
237236
; GCN-O0-NEXT: s_cbranch_execz .LBB1_3
238237
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
238+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
239239
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
240240
; GCN-O0-NEXT: s_waitcnt expcnt(0)
241241
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
242242
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
243243
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
244244
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 0
245245
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 1
246-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
247246
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
248247
; GCN-O0-NEXT: s_mov_b32 s0, 0
249248
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
250249
; GCN-O0-NEXT: s_mov_b32 s1, s2
251250
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
252251
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
253-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
254252
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
255253
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
256254
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
@@ -269,16 +267,16 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
269267
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
270268
; GCN-O0-NEXT: s_cbranch_execz .LBB1_4
271269
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
270+
; GCN-O0-NEXT: s_waitcnt expcnt(1)
271+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
272272
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
273273
; GCN-O0-NEXT: s_waitcnt expcnt(0)
274274
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
275275
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
276276
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
277277
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
278278
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
279-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
280279
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
281-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
282280
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
283281
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
284282
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -312,9 +310,9 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
312310
; GCN-O0-NEXT: v_readlane_b32 s2, v4, 4
313311
; GCN-O0-NEXT: v_readlane_b32 s3, v4, 5
314312
; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
313+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
315314
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
316315
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
317-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
318316
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
319317
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
320318
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
@@ -456,17 +454,18 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
456454
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
457455
; GCN-O0-NEXT: s_cbranch_execz .LBB2_6
458456
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
457+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
459458
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
460459
; GCN-O0-NEXT: s_waitcnt expcnt(0)
461460
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
462461
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
463-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
464462
; GCN-O0-NEXT: s_mov_b32 s0, 2
465-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
463+
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
466464
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0
467465
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
468466
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
469467
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
468+
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
470469
; GCN-O0-NEXT: v_writelane_b32 v4, s2, 4
471470
; GCN-O0-NEXT: v_writelane_b32 v4, s3, 5
472471
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -493,16 +492,15 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
493492
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
494493
; GCN-O0-NEXT: s_cbranch_execz .LBB2_5
495494
; GCN-O0-NEXT: ; %bb.3: ; %bb.then
495+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
496496
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
497497
; GCN-O0-NEXT: s_waitcnt expcnt(0)
498498
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
499499
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
500500
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
501501
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
502502
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
503-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
504503
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
505-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
506504
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
507505
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
508506
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -518,16 +516,15 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
518516
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
519517
; GCN-O0-NEXT: s_branch .LBB2_5
520518
; GCN-O0-NEXT: .LBB2_4: ; %bb.else
519+
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
521520
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
522521
; GCN-O0-NEXT: s_waitcnt expcnt(0)
523522
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
524523
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
525524
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
526525
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
527526
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
528-
; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
529527
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
530-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
531528
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
532529
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
533530
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -724,13 +721,13 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
724721
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
725722
; GCN-O0-NEXT: s_cbranch_execz .LBB3_8
726723
; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then
724+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
725+
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
726+
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
727727
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
728728
; GCN-O0-NEXT: s_waitcnt expcnt(0)
729729
; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
730730
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
731-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
732-
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
733-
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
734731
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
735732
; GCN-O0-NEXT: s_mov_b32 s2, 0
736733
; GCN-O0-NEXT: s_mov_b32 s4, s2
@@ -740,11 +737,12 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
740737
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
741738
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
742739
; GCN-O0-NEXT: v_mov_b32_e32 v1, 1
743-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
740+
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
744741
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 offset:4
745742
; GCN-O0-NEXT: s_mov_b32 s0, 2
746743
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0
747744
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
745+
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
748746
; GCN-O0-NEXT: v_writelane_b32 v6, s0, 4
749747
; GCN-O0-NEXT: v_writelane_b32 v6, s1, 5
750748
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
@@ -770,13 +768,13 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
770768
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
771769
; GCN-O0-NEXT: s_branch .LBB3_7
772770
; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else
771+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
772+
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
773+
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
773774
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
774775
; GCN-O0-NEXT: s_waitcnt expcnt(0)
775776
; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
776777
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
777-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
778-
; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
779-
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
780778
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
781779
; GCN-O0-NEXT: s_mov_b32 s0, 0
782780
; GCN-O0-NEXT: s_mov_b32 s2, s0
@@ -786,10 +784,11 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
786784
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
787785
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
788786
; GCN-O0-NEXT: v_mov_b32_e32 v1, 3
789-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
787+
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
790788
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 offset:12
791789
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0
792790
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
791+
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
793792
; GCN-O0-NEXT: v_writelane_b32 v6, s0, 6
794793
; GCN-O0-NEXT: v_writelane_b32 v6, s1, 7
795794
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
@@ -927,21 +926,20 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
927926
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
928927
; GCN-O0-NEXT: s_cbranch_execz .LBB4_2
929928
; GCN-O0-NEXT: ; %bb.1: ; %bb.then
929+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
930930
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
931931
; GCN-O0-NEXT: s_waitcnt expcnt(0)
932932
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 ; 4-byte Folded Reload
933933
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
934934
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
935935
; GCN-O0-NEXT: v_readlane_b32 s0, v3, 0
936936
; GCN-O0-NEXT: v_readlane_b32 s1, v3, 1
937-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
938937
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
939938
; GCN-O0-NEXT: s_mov_b32 s4, 0
940939
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
941940
; GCN-O0-NEXT: s_mov_b32 s5, s2
942941
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
943942
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
944-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
945943
; GCN-O0-NEXT: v_ashrrev_i32_e64 v2, 31, v0
946944
; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
947945
; GCN-O0-NEXT: v_mov_b32_e32 v1, v2
@@ -1066,6 +1064,8 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
10661064
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
10671065
; GCN-O0-NEXT: .LBB5_1: ; %bb1
10681066
; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1
1067+
; GCN-O0-NEXT: s_waitcnt expcnt(1)
1068+
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
10691069
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
10701070
; GCN-O0-NEXT: s_waitcnt expcnt(0)
10711071
; GCN-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -1077,9 +1077,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
10771077
; GCN-O0-NEXT: v_readlane_b32 s7, v6, 1
10781078
; GCN-O0-NEXT: v_writelane_b32 v6, s6, 4
10791079
; GCN-O0-NEXT: v_writelane_b32 v6, s7, 5
1080-
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
10811080
; GCN-O0-NEXT: s_mov_b32 s4, 0x207
1082-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
10831081
; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v0, s4
10841082
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
10851083
; GCN-O0-NEXT: v_writelane_b32 v6, s4, 6
@@ -1279,14 +1277,14 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12791277
; GCN-O0-NEXT: v_readlane_b32 s8, v6, 16
12801278
; GCN-O0-NEXT: v_readlane_b32 s9, v6, 17
12811279
; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9]
1282-
; GCN-O0-NEXT: v_readlane_b32 s6, v6, 4
1283-
; GCN-O0-NEXT: v_readlane_b32 s7, v6, 5
1284-
; GCN-O0-NEXT: v_readlane_b32 s4, v6, 14
1285-
; GCN-O0-NEXT: v_readlane_b32 s5, v6, 15
12861280
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
12871281
; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
12881282
; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
12891283
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
1284+
; GCN-O0-NEXT: v_readlane_b32 s6, v6, 4
1285+
; GCN-O0-NEXT: v_readlane_b32 s7, v6, 5
1286+
; GCN-O0-NEXT: v_readlane_b32 s4, v6, 14
1287+
; GCN-O0-NEXT: v_readlane_b32 s5, v6, 15
12901288
; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5]
12911289
; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
12921290
; GCN-O0-NEXT: s_mov_b64 s[6:7], 0

0 commit comments

Comments
 (0)