@@ -593,14 +593,10 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
593593; FLATSCR-NEXT: s_waitcnt vmcnt(0)
594594; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:4
595595; FLATSCR-NEXT: s_waitcnt vmcnt(0)
596- ; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 offset:2
597- ; FLATSCR-NEXT: scratch_load_ushort v3, off, s0
598- ; FLATSCR-NEXT: s_waitcnt vmcnt(1)
599- ; FLATSCR-NEXT: v_mov_b32_e32 v1, v0
596+ ; FLATSCR-NEXT: scratch_load_dword v0, off, s0
597+ ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
598+ ; FLATSCR-NEXT: v_lshrrev_b32_e32 v1, 16, v0
600599; FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4
601- ; FLATSCR-NEXT: s_mov_b32 s0, 0x5040100
602- ; FLATSCR-NEXT: s_waitcnt vmcnt(1)
603- ; FLATSCR-NEXT: v_perm_b32 v0, v0, v3, s0
604600; FLATSCR-NEXT: s_waitcnt vmcnt(0)
605601; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
606602; FLATSCR-NEXT: s_endpgm
@@ -660,13 +656,9 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
660656; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
661657; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:4
662658; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
663- ; FLATSCR_GFX10-NEXT: s_clause 0x1
664- ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 offset:2
665- ; FLATSCR_GFX10-NEXT: scratch_load_ushort v3, off, s0
666- ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(1)
667- ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v1, v0
659+ ; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0
668660; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
669- ; FLATSCR_GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
661+ ; FLATSCR_GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
670662; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4
671663; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
672664; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
@@ -689,12 +681,9 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
689681; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
690682; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
691683; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
692- ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v3 , off, off offset:2
684+ ; GFX11-TRUE16-NEXT: scratch_load_b32 v0 , off, off
693685; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
694- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3
695- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
696- ; GFX11-TRUE16-NEXT: s_clause 0x1
697- ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, off
686+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
698687; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
699688; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
700689; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
@@ -717,13 +706,9 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
717706; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
718707; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
719708; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
720- ; GFX11-FAKE16-NEXT: s_clause 0x1
721- ; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, off offset:2
722- ; GFX11-FAKE16-NEXT: scratch_load_u16 v3, off, off
723- ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
724- ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0
709+ ; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, off
725710; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
726- ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
711+ ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
727712; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
728713; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
729714; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
0 commit comments