@@ -9418,78 +9418,80 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
94189418; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
94199419; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
94209420; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9421- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2 )
9421+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1 )
94229422; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4
9423- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
9424- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
94259423; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
9424+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
94269425; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
9427- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
9428- ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
9429- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
9430- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9431- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
9432- ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
9433- ; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
9434- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9426+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
9427+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
94359428; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
94369429; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
9437- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
9438- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
9439- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9430+ ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
9431+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9432+ ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v1, 0xffff0000, v1
9433+ ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v6, 16, v2
9434+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9435+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
9436+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
9437+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
9438+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9439+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v1, 16, 1
9440+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
9441+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
9442+ ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v1, 0x7fff
94409443; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
94419444; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9442- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
9443- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
9444- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
9445+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
9446+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v2
9447+ ; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v3, 0x40c00000, v3
9448+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9449+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
9450+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
9451+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9452+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
9453+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v3, 16, 1
9454+ ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
9455+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
9456+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo
94459457; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
9446- ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v6, 16, v2
9447- ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v1, 16, 1
9448- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
9449- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
9450- ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
9451- ; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v3, 16, 1
9452- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v3
9453- ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v1, 0x7fff
9454- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9455- ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
9456- ; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v3, 0x7fff
9457- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9458- ; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_cndmask_b32 v1, v7, v9
9459- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
9460- ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v6, 0x7fff
9458+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
9459+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
9460+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo
9461+ ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
94619462; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
9462- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
9463- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
9464- ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1
9465- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
9466- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
9463+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
9464+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
94679465; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
9468- ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
9469- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v2
9470- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9471- ; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
9472- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
9473- ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
9474- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v5
9475- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
9476- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v4
9477- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v13, vcc_lo
9466+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9467+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
9468+ ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v12, v2, 0x7fff
9469+ ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v13, v8, 0x7fff
9470+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
94789471; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
9479- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
9480- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
9481- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v10, v14, vcc_lo
9472+ ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v3, 0x7fff
9473+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
9474+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
9475+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v12, vcc_lo
94829476; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
9483- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
9484- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v12, v15, vcc_lo
9485- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
9486- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9487- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
9488- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
9489- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v3, 16, v7
9490- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
9491- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
9492- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v6
9477+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc_lo
9478+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9479+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
9480+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
9481+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v15, vcc_lo
9482+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9483+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v8
9484+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc_lo
9485+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
9486+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6
9487+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h
9488+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v0, 16, v3
9489+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.h
9490+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9491+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v1, 16, v2
9492+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v4
9493+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
9494+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v6, 16, v5
94939495; GFX11-TRUE16-NEXT: .LBB47_2: ; %end
94949496; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
94959497; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
0 commit comments