@@ -858,10 +858,10 @@ define amdgpu_kernel void @select_v2f16(
858858; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
859859; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
860860; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
861- ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s2
862- ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s3
863861; GFX11-TRUE16-NEXT: s_mov_b32 s26, s2
864862; GFX11-TRUE16-NEXT: s_mov_b32 s27, s3
863+ ; GFX11-TRUE16-NEXT: s_mov_b32 s6, s2
864+ ; GFX11-TRUE16-NEXT: s_mov_b32 s7, s3
865865; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
866866; GFX11-TRUE16-NEXT: s_mov_b32 s20, s12
867867; GFX11-TRUE16-NEXT: s_mov_b32 s21, s13
@@ -871,8 +871,8 @@ define amdgpu_kernel void @select_v2f16(
871871; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0
872872; GFX11-TRUE16-NEXT: s_mov_b32 s24, s14
873873; GFX11-TRUE16-NEXT: s_mov_b32 s25, s15
874- ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[4:7 ], 0
875- ; GFX11-TRUE16-NEXT: buffer_load_b32 v3, off, s[24:27 ], 0
874+ ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[24:27 ], 0
875+ ; GFX11-TRUE16-NEXT: buffer_load_b32 v3, off, s[4:7 ], 0
876876; GFX11-TRUE16-NEXT: s_mov_b32 s1, s9
877877; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
878878; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
@@ -881,15 +881,15 @@ define amdgpu_kernel void @select_v2f16(
881881; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
882882; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
883883; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
884- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
885- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
884+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
886885; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v5.l, v4.l
887- ; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo
888- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
889- ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
890- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
886+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
887+ ; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v2.l, vcc_lo
888+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
889+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
890+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
891+ ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0
891892; GFX11-TRUE16-NEXT: s_mov_b32 s0, s8
892- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
893893; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
894894; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
895895; GFX11-TRUE16-NEXT: s_endpgm
@@ -1067,15 +1067,15 @@ define amdgpu_kernel void @select_v2f16_imm_a(
10671067; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l
10681068; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
10691069; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
1070- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1071- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
1070+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
10721071; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, 0x3900, v3.l
1072+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
10731073; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
1074- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1075- ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s0
1074+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
10761075; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
1076+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1077+ ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0
10771078; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
1078- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
10791079; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
10801080; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
10811081; GFX11-TRUE16-NEXT: s_endpgm
@@ -1246,15 +1246,15 @@ define amdgpu_kernel void @select_v2f16_imm_b(
12461246; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l
12471247; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
12481248; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
1249- ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1250- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
1249+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
12511250; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0x3900, v3.l
1251+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
12521252; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
1253- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1254- ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s0
1253+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
12551254; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
1255+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1256+ ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0
12561257; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
1257- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
12581258; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
12591259; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
12601260; GFX11-TRUE16-NEXT: s_endpgm
@@ -1402,42 +1402,42 @@ define amdgpu_kernel void @select_v2f16_imm_c(
14021402;
14031403; GFX11-TRUE16-LABEL: select_v2f16_imm_c:
14041404; GFX11-TRUE16: ; %bb.0: ; %entry
1405- ; GFX11-TRUE16-NEXT: s_load_b256 s[4:11 ], s[4:5], 0x24
1406- ; GFX11-TRUE16-NEXT: s_mov_b32 s2 , -1
1407- ; GFX11-TRUE16-NEXT: s_mov_b32 s3 , 0x31016000
1408- ; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2
1409- ; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
1410- ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
1411- ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
1412- ; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
1413- ; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
1405+ ; GFX11-TRUE16-NEXT: s_load_b256 s[0:7 ], s[4:5], 0x24
1406+ ; GFX11-TRUE16-NEXT: s_mov_b32 s10 , -1
1407+ ; GFX11-TRUE16-NEXT: s_mov_b32 s11 , 0x31016000
1408+ ; GFX11-TRUE16-NEXT: s_mov_b32 s18, s10
1409+ ; GFX11-TRUE16-NEXT: s_mov_b32 s19, s11
1410+ ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
1411+ ; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
1412+ ; GFX11-TRUE16-NEXT: s_mov_b32 s22, s10
1413+ ; GFX11-TRUE16-NEXT: s_mov_b32 s23, s11
14141414; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
1415- ; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
1416- ; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
1417- ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
1418- ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
1415+ ; GFX11-TRUE16-NEXT: s_mov_b32 s16, s4
1416+ ; GFX11-TRUE16-NEXT: s_mov_b32 s17, s5
1417+ ; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
1418+ ; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
14191419; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
14201420; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
1421- ; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10
1422- ; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11
1423- ; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
1421+ ; GFX11-TRUE16-NEXT: s_mov_b32 s20, s6
1422+ ; GFX11-TRUE16-NEXT: s_mov_b32 s21, s7
1423+ ; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
14241424; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0
1425+ ; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
14251426; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
14261427; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
14271428; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
1428- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
14291429; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l
1430+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
14301431; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1431- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
1432- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1433- ; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e64 s0, v4.l, v3.l
14341432; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
1435- ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0
1436- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1433+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1434+ ; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.l, v3.l
1435+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2
14371436; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
1438- ; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
1437+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1438+ ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, vcc_lo
14391439; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
1440- ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3 ], 0
1440+ ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11 ], 0
14411441; GFX11-TRUE16-NEXT: s_endpgm
14421442;
14431443; GFX11-FAKE16-LABEL: select_v2f16_imm_c:
@@ -1590,34 +1590,32 @@ define amdgpu_kernel void @select_v2f16_imm_d(
15901590; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3
15911591; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2
15921592; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3
1593- ; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2
1594- ; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3
15951593; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
15961594; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8
15971595; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9
15981596; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6
15991597; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7
16001598; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0
16011599; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0
1602- ; GFX11-TRUE16-NEXT: s_mov_b32 s20 , s10
1603- ; GFX11-TRUE16-NEXT: s_mov_b32 s21 , s11
1600+ ; GFX11-TRUE16-NEXT: s_mov_b32 s12 , s10
1601+ ; GFX11-TRUE16-NEXT: s_mov_b32 s13 , s11
16041602; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5
1605- ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23 ], 0
1603+ ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[12:15 ], 0
16061604; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
16071605; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
16081606; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
16091607; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
16101608; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
16111609; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1612- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0 , 16, v2
1610+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1 , 16, v2
16131611; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
16141612; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v4.l, v3.l
1615- ; GFX11-TRUE16-NEXT: v_cndmask_b16 v1 .l, 0x3800, v2.l, vcc_lo
1616- ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0 .l, 0x3900, v0 .l, s0
1613+ ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0 .l, 0x3800, v2.l, vcc_lo
1614+ ; GFX11-TRUE16-NEXT: v_cndmask_b16 v1 .l, 0x3900, v1 .l, s0
16171615; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1618- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1 , 0xffff, v1
1616+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0 , 0xffff, v0
16191617; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4
1620- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0 , 16, v1
1618+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1 , 16, v0
16211619; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
16221620; GFX11-TRUE16-NEXT: s_endpgm
16231621;
0 commit comments