@@ -38819,16 +38819,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3881938819; GFX11TRUE16-LABEL: s_select_v2bf16:
3882038820; GFX11TRUE16: ; %bb.0:
3882138821; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
38822- ; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
3882338822; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
38824- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
38825- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
38826- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
38827- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
38828- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
38829- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo
38830- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
38831- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38823+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
38824+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
38825+ ; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
38826+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
38827+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v1.l, vcc_lo
38828+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
38829+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
3883238830; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
3883338831; GFX11TRUE16-NEXT: ; return to shader part epilog
3883438832;
@@ -38936,19 +38934,17 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3893638934;
3893738935; GFX11TRUE16-LABEL: s_vselect_v2bf16:
3893838936; GFX11TRUE16: ; %bb.0:
38939- ; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
38940- ; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
38937+ ; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
3894138938; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
3894238939; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
3894338940; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
38944- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
38945- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
38946- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
38947- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
38948- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
38949- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
38950- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
38951- ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
38941+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
38942+ ; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
38943+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
38944+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, s0, v0.l, s2
38945+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
38946+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, s1, v0.h, vcc_lo
38947+ ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
3895238948; GFX11TRUE16-NEXT: ; return to shader part epilog
3895338949;
3895438950; GFX11FAKE16-LABEL: s_vselect_v2bf16:
@@ -40655,30 +40651,25 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
4065540651;
4065640652; GFX11TRUE16-LABEL: s_vselect_v4bf16:
4065740653; GFX11TRUE16: ; %bb.0:
40658- ; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16
40654+ ; GFX11TRUE16-NEXT: s_lshr_b32 s7, s1, 16
40655+ ; GFX11TRUE16-NEXT: s_lshr_b32 s9, s0, 16
4065940656; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
4066040657; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
40661- ; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16
40662- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
40663- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
40664- ; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
40665- ; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16
4066640658; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
4066740659; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
40668- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8
40669- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
40670- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
40671- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2
40672- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
40673- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1
40674- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s6
40675- ; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v2.l, s4
40676- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
40677- ; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v3.l, vcc_lo
40678- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v3.h, s5
40660+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
40661+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s9
40662+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
40663+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s1
40664+ ; GFX11TRUE16-NEXT: s_lshr_b32 s8, s3, 16
40665+ ; GFX11TRUE16-NEXT: s_lshr_b32 s0, s2, 16
40666+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, s8, v0.l, s6
40667+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v0.h, s4
40668+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s2, v1.l, vcc_lo
40669+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, s3, v1.h, s5
4067940670; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
40680- ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v4
40681- ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0
40671+ ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
40672+ ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
4068240673; GFX11TRUE16-NEXT: ; return to shader part epilog
4068340674;
4068440675; GFX11FAKE16-LABEL: s_vselect_v4bf16:
0 commit comments