@@ -5033,6 +5033,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
50335033; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
50345034; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
50355035; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
5036+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
50365037; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
50375038; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
50385039; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
@@ -5059,15 +5060,10 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
50595060; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
50605061; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
50615062; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
5062- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
50635063; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
5064- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
50655064; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
5066- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
50675065; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
5068- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
50695066; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
5070- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
50715067; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
50725068; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
50735069; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -11993,6 +11989,7 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
1199311989; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
1199411990; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
1199511991; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
11992+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
1199611993; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
1199711994; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
1199811995; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
@@ -12019,15 +12016,10 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
1201912016; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
1202012017; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
1202112018; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
12022- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
1202312019; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
12024- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
1202512020; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
12026- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
1202712021; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
12028- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
1202912022; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
12030- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
1203112023; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
1203212024; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
1203312025; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -18559,6 +18551,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
1855918551; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
1856018552; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
1856118553; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
18554+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
1856218555; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
1856318556; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
1856418557; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
@@ -18596,13 +18589,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
1859618589; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
1859718590; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
1859818591; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
18599- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
1860018592; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
18601- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
1860218593; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
18603- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
1860418594; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
18605- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
1860618595; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
1860718596; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
1860818597; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
@@ -18701,10 +18690,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
1870118690; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
1870218691; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
1870318692; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
18704- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2 )
18693+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0 )
1870518694; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
1870618695; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
18707- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1870818696; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
1870918697; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
1871018698; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
@@ -24640,6 +24628,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
2464024628; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
2464124629; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
2464224630; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
24631+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
2464324632; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
2464424633; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
2464524634; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
@@ -24677,13 +24666,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
2467724666; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
2467824667; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
2467924668; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
24680- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
2468124669; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
24682- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
2468324670; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
24684- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
2468524671; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
24686- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
2468724672; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
2468824673; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
2468924674; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
@@ -24782,10 +24767,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
2478224767; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
2478324768; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
2478424769; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true
24785- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2 )
24770+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0 )
2478624771; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
2478724772; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
24788- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2478924773; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
2479024774; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
2479124775; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
@@ -28760,6 +28744,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
2876028744; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
2876128745; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
2876228746; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
28747+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
2876328748; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
2876428749; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
2876528750; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
@@ -28792,15 +28777,10 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
2879228777; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
2879328778; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
2879428779; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
28795- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
2879628780; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
28797- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
2879828781; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
28799- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
2880028782; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
28801- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
2880228783; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
28803- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
2880428784; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
2880528785; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
2880628786; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -32871,6 +32851,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
3287132851; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
3287232852; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
3287332853; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
32854+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
3287432855; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
3287532856; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
3287632857; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
@@ -32903,15 +32884,10 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
3290332884; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
3290432885; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
3290532886; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
32906- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
3290732887; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
32908- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
3290932888; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
32910- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
3291132889; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
32912- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
3291332890; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
32914- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
3291532891; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
3291632892; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
3291732893; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
0 commit comments