@@ -145,37 +145,36 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
145145; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
146146; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_2
147147; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
148- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
149- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
150- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
151- ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2
148+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
149+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
150+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
151+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
152+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
153+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
152154; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
153- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
155+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
156+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
154157; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
155158; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
159+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
160+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
156161; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
157162; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
158- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
163+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
159164; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
165+ ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
166+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
160167; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
168+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
161169; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
162170; GFX11-TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
163- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
164- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
165171; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
166- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
167- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
168- ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
169- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
170- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
171- ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
172- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
173172; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo
174173; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
175174; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x7fc0
175+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
176176; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
177177; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
178- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
179178; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
180179; GFX11-TRUE16-NEXT: .LBB0_2: ; %end
181180; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -797,40 +796,40 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
797796; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
798797; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_2
799798; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
800- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
801- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
802- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
803- ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2
804- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
799+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
800+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
801+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
805802; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
806- ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
807- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
808- ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
809- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
810- ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
811- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
812- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
813- ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
814- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
815- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
816- ; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
803+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v2
804+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
805+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v1
806+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
807+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
808+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
809+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
810+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
817811; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
818812; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
819- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
820- ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
821- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
822- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
823- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
824- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
825- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
813+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
814+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
815+ ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
816+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
817+ ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
818+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
819+ ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
820+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
826821; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
822+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
823+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
824+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
825+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
826+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
827+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
828+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
827829; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
828- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
829830; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
830- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2
831- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
832- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
833- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
831+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
832+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v2
834833; GFX11-TRUE16-NEXT: .LBB4_2: ; %end
835834; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
836835; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
0 commit comments