@@ -3268,19 +3268,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
32683268; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
32693269; GFX7-NEXT: s_mov_b32 s2, -1
32703270; GFX7-NEXT: s_waitcnt vmcnt(1)
3271- ; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
32723271; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
3272+ ; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
3273+ ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
32733274; GFX7-NEXT: s_waitcnt vmcnt(0)
3275+ ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
3276+ ; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
32743277; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8
32753278; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
3276- ; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8
3277- ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
3278- ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0
3279- ; GFX7-NEXT: v_mul_u32_u24_e32 v3, v6, v3
3279+ ; GFX7-NEXT: v_mul_u32_u24_e32 v1, v1, v5
32803280; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2
32813281; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8
32823282; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
3283- ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3
3283+ ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1
32843284; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
32853285; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1
32863286; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
@@ -3307,18 +3307,18 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
33073307; GFX8-NEXT: v_mov_b32_e32 v0, s4
33083308; GFX8-NEXT: v_mov_b32_e32 v1, s5
33093309; GFX8-NEXT: s_waitcnt vmcnt(1)
3310- ; GFX8-NEXT: v_lshrrev_b32_e32 v8 , 8, v3
3310+ ; GFX8-NEXT: v_lshrrev_b32_e32 v7 , 8, v3
33113311; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
3312- ; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8
3313- ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
3312+ ; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 8
33143313; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
3315- ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
33163314; GFX8-NEXT: s_waitcnt vmcnt(0)
3317- ; GFX8-NEXT: v_lshrrev_b32_e32 v9 , 8, v2
3318- ; GFX8-NEXT: v_and_b32_e32 v7, 0xff , v2
3319- ; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v9, sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3315+ ; GFX8-NEXT: v_lshrrev_b32_e32 v8 , 8, v2
3316+ ; GFX8-NEXT: v_mul_lo_u16_sdwa v6, sext(v3) , v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3317+ ; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8
33203318; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3321- ; GFX8-NEXT: v_mad_u16 v6, v6, v7, v8
3319+ ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3
3320+ ; GFX8-NEXT: v_mad_u16 v6, v8, v7, v6
3321+ ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
33223322; GFX8-NEXT: v_mad_u16 v4, v4, v5, v6
33233323; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2
33243324; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
@@ -3337,19 +3337,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
33373337; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
33383338; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
33393339; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
3340- ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6 , 8, v1
3340+ ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5 , 8, v1
33413341; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
3342- ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7 , 8, v2
3342+ ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6 , 8, v2
33433343; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3344- ; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8
3345- ; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v2
3346- ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3347- ; GFX9-NODL-NEXT: v_and_b32_sdwa v8 , v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3344+ ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3345+ ; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8
3346+ ; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6
3347+ ; GFX9-NODL-NEXT: v_and_b32_sdwa v7 , v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
33483348; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
33493349; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8
3350- ; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v4 , v5, v6
3350+ ; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v6 , v5, v4
33513351; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8
3352- ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8 , v3, v4
3352+ ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7 , v3, v4
33533353; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
33543354; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
33553355; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 16
@@ -3367,19 +3367,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
33673367; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
33683368; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
33693369; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
3370- ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6 , 8, v1
3370+ ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5 , 8, v1
33713371; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
3372- ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7 , 8, v2
3372+ ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6 , 8, v2
33733373; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
3374- ; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8
3375- ; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xff, v2
3376- ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3377- ; GFX9-DL-NEXT: v_and_b32_sdwa v8 , v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3374+ ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
3375+ ; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8
3376+ ; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6
3377+ ; GFX9-DL-NEXT: v_and_b32_sdwa v7 , v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
33783378; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
33793379; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
3380- ; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v4 , v5, v6
3380+ ; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v6 , v5, v4
33813381; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
3382- ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8 , v3, v4
3382+ ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7 , v3, v4
33833383; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
33843384; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
33853385; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 16
@@ -3392,28 +3392,28 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
33923392; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
33933393; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
33943394; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
3395- ; GFX10-DL-NEXT: v_mov_b32_e32 v4 , 0xff
3395+ ; GFX10-DL-NEXT: v_mov_b32_e32 v6 , 0xff
33963396; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
33973397; GFX10-DL-NEXT: s_clause 0x1
33983398; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
33993399; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
34003400; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
3401- ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v1
3401+ ; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8
34023402; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
3403- ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v2
3404- ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
3405- ; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8
3406- ; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v2
3407- ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
3408- ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v3
3403+ ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2
3404+ ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1
3405+ ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
3406+ ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1
34093407; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
3410- ; GFX10-DL-NEXT: v_mul_lo_u16 v0, v3, v0
3411- ; GFX10-DL-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3412- ; GFX10-DL-NEXT: v_bfe_i32 v4, v5, 0, 8
3408+ ; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v3
3409+ ; GFX10-DL-NEXT: v_bfe_i32 v3, v4, 0, 8
3410+ ; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v5
3411+ ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3412+ ; GFX10-DL-NEXT: v_bfe_i32 v6, v7, 0, 8
34133413; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
34143414; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
3415- ; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7 , v0
3416- ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v4 , v0
3415+ ; GFX10-DL-NEXT: v_mad_u16 v0, v4, v3 , v0
3416+ ; GFX10-DL-NEXT: v_mad_u16 v0, v5, v6 , v0
34173417; GFX10-DL-NEXT: v_mad_u16 v0, v1, v2, v0
34183418; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
34193419; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 16
@@ -3429,34 +3429,32 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
34293429; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
34303430; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
34313431; GFX11-DL-TRUE16-NEXT: s_clause 0x1
3432- ; GFX11-DL-TRUE16-NEXT: global_load_b32 v3 , v0, s[0:1]
3433- ; GFX11-DL-TRUE16-NEXT: global_load_b32 v4 , v0, s[2:3]
3432+ ; GFX11-DL-TRUE16-NEXT: global_load_b32 v2 , v0, s[0:1]
3433+ ; GFX11-DL-TRUE16-NEXT: global_load_b32 v3 , v0, s[2:3]
34343434; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
3435- ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 8, v3
3435+ ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v2
3436+ ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8
34363437; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
3437- ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4
3438- ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v3, 0, 8
3439- ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
3440- ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v0, 0, 8
3441- ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3442- ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l
3443- ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
3444- ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
3445- ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
3438+ ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
3439+ ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
3440+ ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
3441+ ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
3442+ ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.h
3443+ ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l
3444+ ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v5.l
34463445; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8
3447- ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3
3448- ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
3449- ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
3450- ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.h
3451- ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
3452- ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v4
3453- ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3454- ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v0.h, v0.l
3455- ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
3456- ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v3.l, v0.l
3446+ ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
3447+ ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v2
3448+ ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v3
3449+ ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
3450+ ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
3451+ ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
3452+ ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
3453+ ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
3454+ ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v2.l, v0.l
34573455; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3458- ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2 .l
3459- ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v4 .l, v0.l
3456+ ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4 .l
3457+ ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v3 .l, v0.l
34603458; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
34613459; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
34623460; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
@@ -3475,25 +3473,24 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
34753473; GFX11-DL-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1]
34763474; GFX11-DL-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3]
34773475; GFX11-DL-FAKE16-NEXT: s_waitcnt vmcnt(1)
3478- ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v1
3476+ ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v2, v1, 0, 8
34793477; GFX11-DL-FAKE16-NEXT: s_waitcnt vmcnt(0)
3480- ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v0
3481- ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
3482- ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
3483- ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v6, v1, 0, 8
3484- ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
3485- ; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
3486- ; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v0
3487- ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 24, v1
3488- ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 24, v0
3489- ; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
3490- ; GFX11-DL-FAKE16-NEXT: v_mul_lo_u16 v2, v3, v2
3478+ ; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v0
3479+ ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1
3480+ ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v0
3481+ ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
3482+ ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
3483+ ; GFX11-DL-FAKE16-NEXT: v_mul_lo_u16 v2, v2, v3
34913484; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8
34923485; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
3493- ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
3494- ; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v6, v7, v2
3495- ; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3486+ ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 24, v1
3487+ ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v5, v6, 0, 8
3488+ ; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
3489+ ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 24, v0
34963490; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v4, v3, v2
3491+ ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
3492+ ; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3493+ ; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v6, v5, v2
34973494; GFX11-DL-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v2
34983495; GFX11-DL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
34993496; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
0 commit comments