@@ -5033,6 +5033,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
5033
5033
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
5034
5034
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
5035
5035
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
5036
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
5036
5037
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
5037
5038
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
5038
5039
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
@@ -5059,15 +5060,10 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
5059
5060
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
5060
5061
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
5061
5062
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
5062
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
5063
5063
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
5064
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
5065
5064
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
5066
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
5067
5065
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
5068
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
5069
5066
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
5070
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
5071
5067
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
5072
5068
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
5073
5069
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -11993,6 +11989,7 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
11993
11989
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
11994
11990
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
11995
11991
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
11992
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
11996
11993
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
11997
11994
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
11998
11995
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
@@ -12019,15 +12016,10 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
12019
12016
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
12020
12017
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
12021
12018
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
12022
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
12023
12019
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
12024
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
12025
12020
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
12026
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
12027
12021
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
12028
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
12029
12022
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
12030
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
12031
12023
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
12032
12024
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
12033
12025
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -18559,6 +18551,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
18559
18551
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
18560
18552
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
18561
18553
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
18554
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
18562
18555
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
18563
18556
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
18564
18557
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
@@ -18596,13 +18589,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
18596
18589
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
18597
18590
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
18598
18591
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
18599
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
18600
18592
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
18601
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
18602
18593
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
18603
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
18604
18594
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
18605
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
18606
18595
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
18607
18596
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
18608
18597
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
@@ -18701,10 +18690,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
18701
18690
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
18702
18691
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
18703
18692
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
18704
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2 )
18693
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0 )
18705
18694
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
18706
18695
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
18707
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
18708
18696
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
18709
18697
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
18710
18698
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
@@ -24640,6 +24628,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
24640
24628
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
24641
24629
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
24642
24630
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
24631
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
24643
24632
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
24644
24633
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
24645
24634
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
@@ -24677,13 +24666,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
24677
24666
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
24678
24667
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
24679
24668
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
24680
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
24681
24669
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
24682
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
24683
24670
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
24684
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
24685
24671
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
24686
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
24687
24672
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
24688
24673
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
24689
24674
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
@@ -24782,10 +24767,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
24782
24767
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
24783
24768
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
24784
24769
; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true
24785
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2 )
24770
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0 )
24786
24771
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
24787
24772
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
24788
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
24789
24773
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
24790
24774
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
24791
24775
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
@@ -28760,6 +28744,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
28760
28744
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
28761
28745
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
28762
28746
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
28747
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
28763
28748
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
28764
28749
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
28765
28750
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
@@ -28792,15 +28777,10 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
28792
28777
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
28793
28778
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
28794
28779
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
28795
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
28796
28780
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
28797
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
28798
28781
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
28799
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
28800
28782
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
28801
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
28802
28783
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
28803
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
28804
28784
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
28805
28785
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
28806
28786
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -32871,6 +32851,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
32871
32851
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
32872
32852
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
32873
32853
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
32854
+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
32874
32855
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
32875
32856
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
32876
32857
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
@@ -32903,15 +32884,10 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
32903
32884
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
32904
32885
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
32905
32886
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
32906
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
32907
32887
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
32908
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
32909
32888
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
32910
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
32911
32889
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
32912
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
32913
32890
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
32914
- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
32915
32891
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
32916
32892
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
32917
32893
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
0 commit comments