Skip to content

Commit c8e4efe

Browse files
authored
[AMDGPU][SIInsertWaitCnts] skip meta instructions early (llvm#145720) (llvm#3897)
2 parents 5030fc1 + 8b45901 commit c8e4efe

11 files changed

+387
-225
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1616,8 +1616,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
16161616
bool FlushVmCnt) {
16171617
setForceEmitWaitcnt();
16181618

1619-
if (MI.isMetaInstruction())
1620-
return false;
1619+
assert(!MI.isMetaInstruction());
16211620

16221621
AMDGPU::Waitcnt Wait;
16231622

@@ -2181,6 +2180,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
21812180
E = Block.instr_end();
21822181
Iter != E;) {
21832182
MachineInstr &Inst = *Iter;
2183+
if (Inst.isMetaInstruction()) {
2184+
++Iter;
2185+
continue;
2186+
}
21842187

21852188
// Track pre-existing waitcnts that were added in earlier iterations or by
21862189
// the memory legalizer.

llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2292,7 +2292,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
22922292
; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
22932293
; GFX9-SDAG-NEXT: s_mov_b32 s34, s12
22942294
; GFX9-SDAG-NEXT: s_mov_b32 s33, s11
2295-
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
22962295
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
22972296
;
22982297
; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
@@ -2359,7 +2358,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
23592358
; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
23602359
; GFX9-GISEL-NEXT: s_mov_b32 s34, s12
23612360
; GFX9-GISEL-NEXT: s_mov_b32 s33, s11
2362-
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
23632361
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
23642362
;
23652363
; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:

llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -620,7 +620,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
620620
; GFX9-NEXT: s_waitcnt vmcnt(0)
621621
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
622622
; GFX9-NEXT: .LBB3_4: ; %exit
623-
; GFX9-NEXT: s_waitcnt vmcnt(0)
624623
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
625624
; GFX9-NEXT: s_movk_i32 s4, 0x8000
626625
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -822,7 +821,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
822821
; GFX9-NEXT: s_waitcnt vmcnt(0)
823822
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
824823
; GFX9-NEXT: .LBB4_4: ; %exit
825-
; GFX9-NEXT: s_waitcnt vmcnt(0)
826824
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1]
827825
; GFX9-NEXT: s_movk_i32 s4, 0x8000
828826
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -1029,7 +1027,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
10291027
; GFX9-NEXT: .LBB5_4: ; %exit
10301028
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
10311029
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
1032-
; GFX9-NEXT: s_waitcnt vmcnt(0)
10331030
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4
10341031
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3800
10351032
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
@@ -1355,7 +1352,6 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
13551352
; GFX9-NEXT: s_movk_i32 s34, 0x3800
13561353
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
13571354
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
1358-
; GFX9-NEXT: s_waitcnt vmcnt(0)
13591355
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v7
13601356
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
13611357
; GFX9-NEXT: v_cmp_gt_u16_sdwa vcc, v7, s34 src0_sel:WORD_1 src1_sel:DWORD
@@ -1647,7 +1643,6 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
16471643
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3800
16481644
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3900
16491645
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3d00
1650-
; GFX9-NEXT: s_waitcnt vmcnt(0)
16511646
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v7
16521647
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
16531648
; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v7, v0 src0_sel:WORD_1 src1_sel:DWORD

llvm/test/CodeGen/AMDGPU/extract-subvector.ll

Lines changed: 0 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -128,7 +128,6 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
128128
; GCN-NEXT: s_mov_b32 s11, 0xf000
129129
; GCN-NEXT: s_mov_b32 s8, s10
130130
; GCN-NEXT: s_mov_b32 s9, s10
131-
; GCN-NEXT: s_waitcnt vmcnt(0)
132131
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
133132
; GCN-NEXT: s_waitcnt vmcnt(0)
134133
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -139,7 +138,6 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
139138
; GCN-NEXT: s_waitcnt vmcnt(0)
140139
; GCN-NEXT: .LBB1_4: ; %exit
141140
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
142-
; GCN-NEXT: s_waitcnt vmcnt(0)
143141
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
144142
; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
145143
; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
@@ -199,7 +197,6 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
199197
; GCN-NEXT: s_mov_b32 s11, 0xf000
200198
; GCN-NEXT: s_mov_b32 s8, s10
201199
; GCN-NEXT: s_mov_b32 s9, s10
202-
; GCN-NEXT: s_waitcnt vmcnt(0)
203200
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
204201
; GCN-NEXT: s_waitcnt vmcnt(0)
205202
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -210,7 +207,6 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
210207
; GCN-NEXT: s_waitcnt vmcnt(0)
211208
; GCN-NEXT: .LBB2_4: ; %exit
212209
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
213-
; GCN-NEXT: s_waitcnt vmcnt(0)
214210
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
215211
; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
216212
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
@@ -308,7 +304,6 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
308304
; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
309305
; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
310306
; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
311-
; GCN-NEXT: s_waitcnt vmcnt(0)
312307
; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
313308
; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
314309
; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
@@ -380,7 +375,6 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
380375
; GCN-NEXT: s_mov_b32 s11, 0xf000
381376
; GCN-NEXT: s_mov_b32 s8, s10
382377
; GCN-NEXT: s_mov_b32 s9, s10
383-
; GCN-NEXT: s_waitcnt vmcnt(0)
384378
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
385379
; GCN-NEXT: s_waitcnt vmcnt(0)
386380
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -391,7 +385,6 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
391385
; GCN-NEXT: s_waitcnt vmcnt(0)
392386
; GCN-NEXT: .LBB4_4: ; %exit
393387
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
394-
; GCN-NEXT: s_waitcnt vmcnt(0)
395388
; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
396389
; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[4:5]
397390
; GCN-NEXT: v_cndmask_b32_e64 v1, v0, -2.0, vcc
@@ -451,7 +444,6 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
451444
; GCN-NEXT: s_mov_b32 s11, 0xf000
452445
; GCN-NEXT: s_mov_b32 s8, s10
453446
; GCN-NEXT: s_mov_b32 s9, s10
454-
; GCN-NEXT: s_waitcnt vmcnt(0)
455447
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
456448
; GCN-NEXT: s_waitcnt vmcnt(0)
457449
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -462,7 +454,6 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
462454
; GCN-NEXT: s_waitcnt vmcnt(0)
463455
; GCN-NEXT: .LBB5_4: ; %exit
464456
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
465-
; GCN-NEXT: s_waitcnt vmcnt(0)
466457
; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
467458
; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5]
468459
; GCN-NEXT: v_cndmask_b32_e32 v1, -2.0, v0, vcc
@@ -560,7 +551,6 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
560551
; GCN-NEXT: v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11]
561552
; GCN-NEXT: v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13]
562553
; GCN-NEXT: v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15]
563-
; GCN-NEXT: s_waitcnt vmcnt(0)
564554
; GCN-NEXT: v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17]
565555
; GCN-NEXT: v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19]
566556
; GCN-NEXT: v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5]

llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -6178,13 +6178,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
61786178
; NOOPT-NEXT: v_mov_b32_e32 v11, v14
61796179
; NOOPT-NEXT: v_mov_b32_e32 v12, v13
61806180
; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 offset:32
6181-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
6181+
; NOOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
61826182
; NOOPT-NEXT: ; implicit-def: $sgpr1
61836183
; NOOPT-NEXT: ; implicit-def: $sgpr1
61846184
; NOOPT-NEXT: ; implicit-def: $sgpr1
61856185
; NOOPT-NEXT: ; implicit-def: $sgpr1
61866186
; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
6187-
; NOOPT-NEXT: s_waitcnt expcnt(0)
61886187
; NOOPT-NEXT: v_mov_b32_e32 v9, v4
61896188
; NOOPT-NEXT: v_mov_b32_e32 v10, v3
61906189
; NOOPT-NEXT: v_mov_b32_e32 v11, v2
@@ -7297,7 +7296,6 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
72977296
; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
72987297
; NOOPT-NEXT: s_waitcnt vmcnt(0)
72997298
; NOOPT-NEXT: ; implicit-def: $sgpr0
7300-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
73017299
; NOOPT-NEXT: ;;#ASMSTART
73027300
; NOOPT-NEXT: ; reg use v[0:3]
73037301
; NOOPT-NEXT: ;;#ASMEND
@@ -7320,7 +7318,6 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
73207318
; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
73217319
; NOOPT-NEXT: s_waitcnt vmcnt(0)
73227320
; NOOPT-NEXT: ; implicit-def: $sgpr0
7323-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
73247321
; NOOPT-NEXT: ;;#ASMSTART
73257322
; NOOPT-NEXT: ; reg use v[0:3]
73267323
; NOOPT-NEXT: ;;#ASMEND
@@ -7541,7 +7538,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
75417538
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75427539
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
75437540
; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
7544-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75457541
; NOOPT-NEXT: ;;#ASMSTART
75467542
; NOOPT-NEXT: ; reg use v[0:3]
75477543
; NOOPT-NEXT: ;;#ASMEND
@@ -7565,7 +7561,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
75657561
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75667562
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
75677563
; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
7568-
; NOOPT-NEXT: s_waitcnt vmcnt(0)
75697564
; NOOPT-NEXT: ;;#ASMSTART
75707565
; NOOPT-NEXT: ; reg use v[0:3]
75717566
; NOOPT-NEXT: ;;#ASMEND

llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -150,7 +150,6 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
150150
; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
151151
; MUBUF-NEXT: s_waitcnt vmcnt(0)
152152
; MUBUF-NEXT: s_mov_b32 s33, s5
153-
; MUBUF-NEXT: s_waitcnt vmcnt(0)
154153
; MUBUF-NEXT: s_setpc_b64 s[30:31]
155154
;
156155
; FLATSCR-LABEL: func_local_stack_offset_uses_sp:
@@ -192,7 +191,6 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
192191
; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
193192
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
194193
; FLATSCR-NEXT: s_mov_b32 s33, s2
195-
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
196194
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
197195
entry:
198196
%pin.low = alloca i32, align 8192, addrspace(5)

0 commit comments

Comments
 (0)