@@ -71,17 +71,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
7171; GFX10: ; %bb.0:
7272; GFX10-NEXT: v_mov_b32_e32 v0, 0
7373; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1
74- ; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
7574; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
7675; GFX10-NEXT: s_waitcnt vmcnt(0)
7776; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
7877; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
7978; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0
80- ; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
81- ; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s4, s0
79+ ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
80+ ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s0
81+ ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
8282; GFX10-NEXT: v_mov_b32_e32 v0, 0
8383; GFX10-NEXT: v_mov_b32_e32 v1, 0
84- ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
84+ ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
8585; GFX10-NEXT: global_store_short v[0:1], v2, off
8686; GFX10-NEXT: s_endpgm
8787;
@@ -175,16 +175,16 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
175175; GFX10: ; %bb.0:
176176; GFX10-NEXT: global_load_ushort v0, v[0:1], off
177177; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1
178- ; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
179178; GFX10-NEXT: s_waitcnt vmcnt(0)
180179; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
181180; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0
182181; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 0
183- ; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
184- ; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s2, s0
182+ ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
183+ ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s0
184+ ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
185185; GFX10-NEXT: v_mov_b32_e32 v0, 0
186186; GFX10-NEXT: v_mov_b32_e32 v1, 0
187- ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
187+ ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
188188; GFX10-NEXT: global_store_short v[0:1], v2, off
189189; GFX10-NEXT: s_endpgm
190190;
@@ -277,17 +277,17 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
277277; GFX10: ; %bb.0:
278278; GFX10-NEXT: v_mov_b32_e32 v1, 0
279279; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1
280- ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
281280; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
282281; GFX10-NEXT: s_waitcnt vmcnt(0)
283282; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
284283; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
285284; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0
286- ; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
287- ; GFX10-NEXT: v_cndmask_b32_e32 v4 , v1, v0, vcc_lo
285+ ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
286+ ; GFX10-NEXT: v_cndmask_b32_e32 v3 , v1, v0, vcc_lo
288287; GFX10-NEXT: v_mov_b32_e32 v0, 0
289288; GFX10-NEXT: v_mov_b32_e32 v1, 0
290- ; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
289+ ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
290+ ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
291291; GFX10-NEXT: global_store_short v[0:1], v2, off
292292; GFX10-NEXT: s_endpgm
293293;
@@ -383,17 +383,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
383383; GFX10: ; %bb.0:
384384; GFX10-NEXT: v_mov_b32_e32 v1, 0
385385; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
386- ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
387386; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
388387; GFX10-NEXT: s_waitcnt vmcnt(0)
389388; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
390389; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo
391390; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
392- ; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
393- ; GFX10-NEXT: v_cndmask_b32_e64 v4 , v1, s4, vcc_lo
391+ ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
392+ ; GFX10-NEXT: v_cndmask_b32_e64 v3 , v1, s4, vcc_lo
394393; GFX10-NEXT: v_mov_b32_e32 v0, 0
395394; GFX10-NEXT: v_mov_b32_e32 v1, 0
396- ; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
395+ ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
396+ ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
397397; GFX10-NEXT: global_store_short v[0:1], v2, off
398398; GFX10-NEXT: s_endpgm
399399;
@@ -487,16 +487,16 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
487487; GFX10: ; %bb.0:
488488; GFX10-NEXT: v_mov_b32_e32 v2, 0
489489; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
490- ; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
491490; GFX10-NEXT: global_load_ushort v2, v2, s[2:3]
492491; GFX10-NEXT: s_waitcnt vmcnt(0)
493492; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
494493; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
495494; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
496- ; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
495+ ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
497496; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
498497; GFX10-NEXT: v_mov_b32_e32 v0, 0
499498; GFX10-NEXT: v_mov_b32_e32 v1, 0
499+ ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
500500; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
501501; GFX10-NEXT: global_store_short v[0:1], v2, off
502502; GFX10-NEXT: s_endpgm
@@ -590,13 +590,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
590590; GFX10: ; %bb.0:
591591; GFX10-NEXT: global_load_ushort v0, v[0:1], off
592592; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
593- ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
594593; GFX10-NEXT: s_waitcnt vmcnt(0)
595594; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
596595; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
597596; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
598- ; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
597+ ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
599598; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo
599+ ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
600600; GFX10-NEXT: v_mov_b32_e32 v0, 0
601601; GFX10-NEXT: v_mov_b32_e32 v1, 0
602602; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -689,13 +689,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
689689; GFX10: ; %bb.0:
690690; GFX10-NEXT: global_load_ushort v0, v[0:1], off
691691; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
692- ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
693692; GFX10-NEXT: s_waitcnt vmcnt(0)
694693; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
695694; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
696695; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
697- ; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
696+ ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
698697; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
698+ ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
699699; GFX10-NEXT: v_mov_b32_e32 v0, 0
700700; GFX10-NEXT: v_mov_b32_e32 v1, 0
701701; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -788,13 +788,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
788788; GFX10: ; %bb.0:
789789; GFX10-NEXT: global_load_ushort v0, v[0:1], off
790790; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
791- ; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
792791; GFX10-NEXT: s_waitcnt vmcnt(0)
793792; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
794793; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
795794; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
796- ; GFX10-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
795+ ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
797796; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
797+ ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
798798; GFX10-NEXT: v_mov_b32_e32 v0, 0
799799; GFX10-NEXT: v_mov_b32_e32 v1, 0
800800; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
0 commit comments