@@ -3268,19 +3268,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
32683268; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
32693269; GFX7-NEXT: s_mov_b32 s2, -1
32703270; GFX7-NEXT: s_waitcnt vmcnt(1)
3271- ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
32723271; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
3273- ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
3272+ ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
32743273; GFX7-NEXT: s_waitcnt vmcnt(0)
3275- ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
3276- ; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
32773274; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
32783275; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
3279- ; GFX7-NEXT: v_mul_u32_u24_e32 v1, v1, v5
3276+ ; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
3277+ ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
3278+ ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
3279+ ; GFX7-NEXT: v_mul_u32_u24_e32 v3, v6, v3
32803280; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2
32813281; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
32823282; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
3283- ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1
3283+ ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
32843284; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
32853285; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1
32863286; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -3307,18 +3307,18 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
33073307; GFX8-NEXT: v_mov_b32_e32 v0, s4
33083308; GFX8-NEXT: v_mov_b32_e32 v1, s5
33093309; GFX8-NEXT: s_waitcnt vmcnt(1)
3310- ; GFX8-NEXT: v_lshrrev_b32_e32 v7 , 8, v3
3310+ ; GFX8-NEXT: v_lshrrev_b32_e32 v8 , 8, v3
33113311; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
3312- ; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 8
3312+ ; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8
3313+ ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
33133314; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
3315+ ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
33143316; GFX8-NEXT: s_waitcnt vmcnt(0)
3315- ; GFX8-NEXT: v_lshrrev_b32_e32 v8 , 8, v2
3316- ; GFX8-NEXT: v_mul_lo_u16_sdwa v6, sext(v3) , v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3317- ; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8
3317+ ; GFX8-NEXT: v_lshrrev_b32_e32 v9 , 8, v2
3318+ ; GFX8-NEXT: v_and_b32_e32 v7, 0xff , v2
3319+ ; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v9, sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
33183320; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3319- ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
3320- ; GFX8-NEXT: v_mad_u16 v6, v8, v7, v6
3321- ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
3321+ ; GFX8-NEXT: v_mad_u16 v6, v6, v7, v8
33223322; GFX8-NEXT: v_mad_u16 v4, v4, v5, v6
33233323; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
33243324; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
@@ -3337,19 +3337,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
33373337; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
33383338; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
33393339; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
3340- ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5 , 8, v1
3340+ ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6 , 8, v1
33413341; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
3342- ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6 , 8, v2
3342+ ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7 , 8, v2
33433343; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3344- ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3345- ; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8
3346- ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6
3347- ; GFX9-NODL-NEXT: v_and_b32_sdwa v7 , v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3344+ ; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8
3345+ ; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v2
3346+ ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3347+ ; GFX9-NODL-NEXT: v_and_b32_sdwa v8 , v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
33483348; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
33493349; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8
3350- ; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v6 , v5, v4
3350+ ; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v4 , v5, v6
33513351; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8
3352- ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7 , v3, v4
3352+ ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8 , v3, v4
33533353; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
33543354; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
33553355; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 16
@@ -3367,19 +3367,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
33673367; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
33683368; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
33693369; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
3370- ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5 , 8, v1
3370+ ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6 , 8, v1
33713371; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
3372- ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6 , 8, v2
3372+ ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7 , 8, v2
33733373; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3374- ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3375- ; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8
3376- ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6
3377- ; GFX9-DL-NEXT: v_and_b32_sdwa v7 , v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3374+ ; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8
3375+ ; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xff, v2
3376+ ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3377+ ; GFX9-DL-NEXT: v_and_b32_sdwa v8 , v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
33783378; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
33793379; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
3380- ; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v6 , v5, v4
3380+ ; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v4 , v5, v6
33813381; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
3382- ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7 , v3, v4
3382+ ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8 , v3, v4
33833383; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
33843384; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
33853385; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 16
@@ -3392,28 +3392,28 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
33923392; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
33933393; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
33943394; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3395- ; GFX10-DL-NEXT: v_mov_b32_e32 v6 , 0xff
3395+ ; GFX10-DL-NEXT: v_mov_b32_e32 v4 , 0xff
33963396; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
33973397; GFX10-DL-NEXT: s_clause 0x1
33983398; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
33993399; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
34003400; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
3401- ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8
3401+ ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v1
34023402; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
3403- ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2
3404- ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1
3405- ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
3406- ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1
3403+ ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v2
3404+ ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
3405+ ; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8
3406+ ; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v2
3407+ ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
3408+ ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v3
34073409; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
3408- ; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v3
3409- ; GFX10-DL-NEXT: v_bfe_i32 v3, v4, 0, 8
3410- ; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v5
3411- ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3412- ; GFX10-DL-NEXT: v_bfe_i32 v6, v7, 0, 8
3410+ ; GFX10-DL-NEXT: v_mul_lo_u16 v0, v3, v0
3411+ ; GFX10-DL-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3412+ ; GFX10-DL-NEXT: v_bfe_i32 v4, v5, 0, 8
34133413; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
34143414; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
3415- ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v3 , v0
3416- ; GFX10-DL-NEXT: v_mad_u16 v0, v5, v6 , v0
3415+ ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7 , v0
3416+ ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v4 , v0
34173417; GFX10-DL-NEXT: v_mad_u16 v0, v1, v2, v0
34183418; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
34193419; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 16
@@ -3429,32 +3429,34 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
34293429; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
34303430; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
34313431; GFX11-DL-TRUE16-NEXT: s_clause 0x1
3432- ; GFX11-DL-TRUE16-NEXT: global_load_b32 v2 , v0, s[0:1]
3433- ; GFX11-DL-TRUE16-NEXT: global_load_b32 v3 , v0, s[2:3]
3432+ ; GFX11-DL-TRUE16-NEXT: global_load_b32 v3 , v0, s[0:1]
3433+ ; GFX11-DL-TRUE16-NEXT: global_load_b32 v4 , v0, s[2:3]
34343434; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
3435- ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v2
3436- ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8
3435+ ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 8, v3
34373436; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
3438- ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
3439- ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
3440- ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
3441- ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
3442- ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.h
3443- ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l
3444- ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v5.l
3445- ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8
3446- ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
3447- ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v2
3448- ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v3
3449- ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
3437+ ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4
3438+ ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v3, 0, 8
3439+ ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
3440+ ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v0, 0, 8
3441+ ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3442+ ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l
3443+ ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
3444+ ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
34503445; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
3451- ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
3452- ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
3453- ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
3454- ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v2.l, v0.l
3455- ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3456- ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
3446+ ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8
3447+ ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3
3448+ ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
3449+ ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
3450+ ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.h
3451+ ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
3452+ ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v4
3453+ ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3454+ ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v0.h, v0.l
3455+ ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
34573456; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v3.l, v0.l
3457+ ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3458+ ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
3459+ ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v4.l, v0.l
34583460; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
34593461; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
34603462; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
@@ -3473,24 +3475,25 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
34733475; GFX11-DL-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1]
34743476; GFX11-DL-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3]
34753477; GFX11-DL-FAKE16-NEXT: s_waitcnt vmcnt(1)
3476- ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v2, v1, 0, 8
3478+ ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v1
34773479; GFX11-DL-FAKE16-NEXT: s_waitcnt vmcnt(0)
3478- ; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v0
3479- ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1
3480- ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v0
3481- ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
3482- ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
3483- ; GFX11-DL-FAKE16-NEXT: v_mul_lo_u16 v2, v2, v3
3484- ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8
3485- ; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
3480+ ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v0
3481+ ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
3482+ ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
3483+ ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v6, v1, 0, 8
3484+ ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
3485+ ; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
3486+ ; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v0
34863487; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 24, v1
3487- ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v5, v6, 0, 8
3488- ; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
34893488; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 24, v0
3490- ; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v4, v3, v2
3489+ ; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
3490+ ; GFX11-DL-FAKE16-NEXT: v_mul_lo_u16 v2, v3, v2
3491+ ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8
3492+ ; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
34913493; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
3492- ; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3493- ; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v6, v5, v2
3494+ ; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v6, v7, v2
3495+ ; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3496+ ; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v4, v3, v2
34943497; GFX11-DL-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v2
34953498; GFX11-DL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
34963499; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
0 commit comments