Skip to content

Commit 8b45901

Browse files
ssahasrabcahoon
authored and committed
[AMDGPU][SIInsertWaitCnts] skip meta instructions early (llvm#145720)
When iterating over a block, meta instructions have no effect on wait counts, but their presence drops the reference to earlier waitcnt instructions before those waitcnt instructions are processed. This results in spurious wait counts, which do not affect correctness but are not required in the resulting program. Skipping meta instructions as soon as they are seen cleans this up. (cherry picked from commit a34a024)
1 parent 5d8141d commit 8b45901

11 files changed

+26
-81
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1616,8 +1616,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
16161616
bool FlushVmCnt) {
16171617
setForceEmitWaitcnt();
16181618

1619-
if (MI.isMetaInstruction())
1620-
return false;
1619+
assert(!MI.isMetaInstruction());
16211620

16221621
AMDGPU::Waitcnt Wait;
16231622

@@ -2181,6 +2180,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
21812180
E = Block.instr_end();
21822181
Iter != E;) {
21832182
MachineInstr &Inst = *Iter;
2183+
if (Inst.isMetaInstruction()) {
2184+
++Iter;
2185+
continue;
2186+
}
21842187

21852188
// Track pre-existing waitcnts that were added in earlier iterations or by
21862189
// the memory legalizer.

llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2292,7 +2292,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
22922292
; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
22932293
; GFX9-SDAG-NEXT: s_mov_b32 s34, s12
22942294
; GFX9-SDAG-NEXT: s_mov_b32 s33, s11
2295-
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
22962295
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
22972296
;
22982297
; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
@@ -2359,7 +2358,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
23592358
; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
23602359
; GFX9-GISEL-NEXT: s_mov_b32 s34, s12
23612360
; GFX9-GISEL-NEXT: s_mov_b32 s33, s11
2362-
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
23632361
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
23642362
;
23652363
; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:

llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -620,7 +620,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
620620
; GFX9-NEXT: s_waitcnt vmcnt(0)
621621
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
622622
; GFX9-NEXT: .LBB3_4: ; %exit
623-
; GFX9-NEXT: s_waitcnt vmcnt(0)
624623
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
625624
; GFX9-NEXT: s_movk_i32 s4, 0x8000
626625
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -822,7 +821,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
822821
; GFX9-NEXT: s_waitcnt vmcnt(0)
823822
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
824823
; GFX9-NEXT: .LBB4_4: ; %exit
825-
; GFX9-NEXT: s_waitcnt vmcnt(0)
826824
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1]
827825
; GFX9-NEXT: s_movk_i32 s4, 0x8000
828826
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -1029,7 +1027,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
10291027
; GFX9-NEXT: .LBB5_4: ; %exit
10301028
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
10311029
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
1032-
; GFX9-NEXT: s_waitcnt vmcnt(0)
10331030
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4
10341031
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3800
10351032
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
@@ -1355,7 +1352,6 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
13551352
; GFX9-NEXT: s_movk_i32 s34, 0x3800
13561353
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
13571354
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
1358-
; GFX9-NEXT: s_waitcnt vmcnt(0)
13591355
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v7
13601356
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
13611357
; GFX9-NEXT: v_cmp_gt_u16_sdwa vcc, v7, s34 src0_sel:WORD_1 src1_sel:DWORD
@@ -1647,7 +1643,6 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
16471643
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3800
16481644
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3900
16491645
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3d00
1650-
; GFX9-NEXT: s_waitcnt vmcnt(0)
16511646
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v7
16521647
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
16531648
; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v7, v0 src0_sel:WORD_1 src1_sel:DWORD

llvm/test/CodeGen/AMDGPU/extract-subvector.ll

Lines changed: 0 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,6 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
128128
; GCN-NEXT: s_mov_b32 s11, 0xf000
129129
; GCN-NEXT: s_mov_b32 s8, s10
130130
; GCN-NEXT: s_mov_b32 s9, s10
131-
; GCN-NEXT: s_waitcnt vmcnt(0)
132131
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
133132
; GCN-NEXT: s_waitcnt vmcnt(0)
134133
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -139,7 +138,6 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
139138
; GCN-NEXT: s_waitcnt vmcnt(0)
140139
; GCN-NEXT: .LBB1_4: ; %exit
141140
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
142-
; GCN-NEXT: s_waitcnt vmcnt(0)
143141
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
144142
; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
145143
; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
@@ -199,7 +197,6 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
199197
; GCN-NEXT: s_mov_b32 s11, 0xf000
200198
; GCN-NEXT: s_mov_b32 s8, s10
201199
; GCN-NEXT: s_mov_b32 s9, s10
202-
; GCN-NEXT: s_waitcnt vmcnt(0)
203200
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
204201
; GCN-NEXT: s_waitcnt vmcnt(0)
205202
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -210,7 +207,6 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
210207
; GCN-NEXT: s_waitcnt vmcnt(0)
211208
; GCN-NEXT: .LBB2_4: ; %exit
212209
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
213-
; GCN-NEXT: s_waitcnt vmcnt(0)
214210
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
215211
; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
216212
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
@@ -308,7 +304,6 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
308304
; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
309305
; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
310306
; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
311-
; GCN-NEXT: s_waitcnt vmcnt(0)
312307
; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
313308
; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
314309
; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
@@ -380,7 +375,6 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
380375
; GCN-NEXT: s_mov_b32 s11, 0xf000
381376
; GCN-NEXT: s_mov_b32 s8, s10
382377
; GCN-NEXT: s_mov_b32 s9, s10
383-
; GCN-NEXT: s_waitcnt vmcnt(0)
384378
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
385379
; GCN-NEXT: s_waitcnt vmcnt(0)
386380
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -391,7 +385,6 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
391385
; GCN-NEXT: s_waitcnt vmcnt(0)
392386
; GCN-NEXT: .LBB4_4: ; %exit
393387
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
394-
; GCN-NEXT: s_waitcnt vmcnt(0)
395388
; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
396389
; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[4:5]
397390
; GCN-NEXT: v_cndmask_b32_e64 v1, v0, -2.0, vcc
@@ -451,7 +444,6 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
451444
; GCN-NEXT: s_mov_b32 s11, 0xf000
452445
; GCN-NEXT: s_mov_b32 s8, s10
453446
; GCN-NEXT: s_mov_b32 s9, s10
454-
; GCN-NEXT: s_waitcnt vmcnt(0)
455447
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
456448
; GCN-NEXT: s_waitcnt vmcnt(0)
457449
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -462,7 +454,6 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
462454
; GCN-NEXT: s_waitcnt vmcnt(0)
463455
; GCN-NEXT: .LBB5_4: ; %exit
464456
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
465-
; GCN-NEXT: s_waitcnt vmcnt(0)
466457
; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
467458
; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5]
468459
; GCN-NEXT: v_cndmask_b32_e32 v1, -2.0, v0, vcc
@@ -560,7 +551,6 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
560551
; GCN-NEXT: v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11]
561552
; GCN-NEXT: v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13]
562553
; GCN-NEXT: v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15]
563-
; GCN-NEXT: s_waitcnt vmcnt(0)
564554
; GCN-NEXT: v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17]
565555
; GCN-NEXT: v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19]
566556
; GCN-NEXT: v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5]

llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -6178,13 +6178,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
61786178
; NOOPT-NEXT: v_mov_b32_e32 v11, v14
61796179
; NOOPT-NEXT: v_mov_b32_e32 v12, v13
61806180
; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 offset:32
6181-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
6181+
; NOOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
61826182
; NOOPT-NEXT: ; implicit-def: $sgpr1
61836183
; NOOPT-NEXT: ; implicit-def: $sgpr1
61846184
; NOOPT-NEXT: ; implicit-def: $sgpr1
61856185
; NOOPT-NEXT: ; implicit-def: $sgpr1
61866186
; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
6187-
; NOOPT-NEXT: s_waitcnt expcnt(0)
61886187
; NOOPT-NEXT: v_mov_b32_e32 v9, v4
61896188
; NOOPT-NEXT: v_mov_b32_e32 v10, v3
61906189
; NOOPT-NEXT: v_mov_b32_e32 v11, v2
@@ -7297,7 +7296,6 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
72977296
; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
72987297
; NOOPT-NEXT: s_waitcnt vmcnt(0)
72997298
; NOOPT-NEXT: ; implicit-def: $sgpr0
7300-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
73017299
; NOOPT-NEXT: ;;#ASMSTART
73027300
; NOOPT-NEXT: ; reg use v[0:3]
73037301
; NOOPT-NEXT: ;;#ASMEND
@@ -7320,7 +7318,6 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
73207318
; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
73217319
; NOOPT-NEXT: s_waitcnt vmcnt(0)
73227320
; NOOPT-NEXT: ; implicit-def: $sgpr0
7323-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
73247321
; NOOPT-NEXT: ;;#ASMSTART
73257322
; NOOPT-NEXT: ; reg use v[0:3]
73267323
; NOOPT-NEXT: ;;#ASMEND
@@ -7541,7 +7538,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
75417538
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75427539
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
75437540
; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
7544-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75457541
; NOOPT-NEXT: ;;#ASMSTART
75467542
; NOOPT-NEXT: ; reg use v[0:3]
75477543
; NOOPT-NEXT: ;;#ASMEND
@@ -7565,7 +7561,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
75657561
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75667562
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
75677563
; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
7568-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75697564
; NOOPT-NEXT: ;;#ASMSTART
75707565
; NOOPT-NEXT: ; reg use v[0:3]
75717566
; NOOPT-NEXT: ;;#ASMEND

llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -150,7 +150,6 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
150150
; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
151151
; MUBUF-NEXT: s_waitcnt vmcnt(0)
152152
; MUBUF-NEXT: s_mov_b32 s33, s5
153-
; MUBUF-NEXT: s_waitcnt vmcnt(0)
154153
; MUBUF-NEXT: s_setpc_b64 s[30:31]
155154
;
156155
; FLATSCR-LABEL: func_local_stack_offset_uses_sp:
@@ -192,7 +191,6 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
192191
; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
193192
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
194193
; FLATSCR-NEXT: s_mov_b32 s33, s2
195-
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
196194
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
197195
entry:
198196
%pin.low = alloca i32, align 8192, addrspace(5)

0 commit comments

Comments
 (0)