@@ -48,15 +48,25 @@ define <2 x half> @chain_hi_to_lo_private() {
4848; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
4949; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
5050;
51- ; GFX11-LABEL: chain_hi_to_lo_private:
52- ; GFX11: ; %bb.0: ; %bb
53- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54- ; GFX11-NEXT: s_mov_b32 s0, 2
55- ; GFX11-NEXT: scratch_load_u16 v0, off, s0
56- ; GFX11-NEXT: s_mov_b32 s0, 0
57- ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0
58- ; GFX11-NEXT: s_waitcnt vmcnt(0)
59- ; GFX11-NEXT: s_setpc_b64 s[30:31]
51+ ; GFX11-TRUE16-LABEL: chain_hi_to_lo_private:
52+ ; GFX11-TRUE16: ; %bb.0: ; %bb
53+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54+ ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 2
55+ ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s0
56+ ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
57+ ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0
58+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
59+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
60+ ;
61+ ; GFX11-FAKE16-LABEL: chain_hi_to_lo_private:
62+ ; GFX11-FAKE16: ; %bb.0: ; %bb
63+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64+ ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 2
65+ ; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s0
66+ ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
67+ ; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0
68+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
69+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
6070bb:
6171 %gep_lo = getelementptr inbounds half , ptr addrspace (5 ) null , i64 1
6272 %load_lo = load half , ptr addrspace (5 ) %gep_lo
@@ -104,13 +114,21 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base
104114; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
105115; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
106116;
107- ; GFX11-LABEL: chain_hi_to_lo_private_different_bases:
108- ; GFX11: ; %bb.0: ; %bb
109- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
110- ; GFX11-NEXT: scratch_load_u16 v0, v0, off
111- ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, v1, off
112- ; GFX11-NEXT: s_waitcnt vmcnt(0)
113- ; GFX11-NEXT: s_setpc_b64 s[30:31]
117+ ; GFX11-TRUE16-LABEL: chain_hi_to_lo_private_different_bases:
118+ ; GFX11-TRUE16: ; %bb.0: ; %bb
119+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120+ ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, v0, off
121+ ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off
122+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
123+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
124+ ;
125+ ; GFX11-FAKE16-LABEL: chain_hi_to_lo_private_different_bases:
126+ ; GFX11-FAKE16: ; %bb.0: ; %bb
127+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128+ ; GFX11-FAKE16-NEXT: scratch_load_u16 v0, v0, off
129+ ; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off
130+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
131+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
114132bb:
115133 %load_lo = load half , ptr addrspace (5 ) %base_lo
116134 %load_hi = load half , ptr addrspace (5 ) %base_hi
@@ -288,17 +306,29 @@ define <2 x half> @chain_hi_to_lo_global() {
288306; GFX10-NEXT: s_waitcnt vmcnt(0)
289307; GFX10-NEXT: s_setpc_b64 s[30:31]
290308;
291- ; GFX11-LABEL: chain_hi_to_lo_global:
292- ; GFX11: ; %bb.0: ; %bb
293- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294- ; GFX11-NEXT: v_mov_b32_e32 v0, 2
295- ; GFX11-NEXT: v_mov_b32_e32 v1, 0
296- ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
297- ; GFX11-NEXT: v_mov_b32_e32 v1, 0
298- ; GFX11-NEXT: v_mov_b32_e32 v2, 0
299- ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
300- ; GFX11-NEXT: s_waitcnt vmcnt(0)
301- ; GFX11-NEXT: s_setpc_b64 s[30:31]
309+ ; GFX11-TRUE16-LABEL: chain_hi_to_lo_global:
310+ ; GFX11-TRUE16: ; %bb.0: ; %bb
311+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2
313+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
314+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
315+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
316+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
317+ ; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
318+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
319+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
320+ ;
321+ ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global:
322+ ; GFX11-FAKE16: ; %bb.0: ; %bb
323+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
324+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
325+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
326+ ; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
327+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
328+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
329+ ; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
330+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
331+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
302332bb:
303333 %gep_lo = getelementptr inbounds half , ptr addrspace (1 ) null , i64 1
304334 %load_lo = load half , ptr addrspace (1 ) %gep_lo
@@ -328,13 +358,21 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_
328358; GFX10-NEXT: s_waitcnt vmcnt(0)
329359; GFX10-NEXT: s_setpc_b64 s[30:31]
330360;
331- ; GFX11-LABEL: chain_hi_to_lo_global_different_bases:
332- ; GFX11: ; %bb.0: ; %bb
333- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
334- ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
335- ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
336- ; GFX11-NEXT: s_waitcnt vmcnt(0)
337- ; GFX11-NEXT: s_setpc_b64 s[30:31]
361+ ; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_different_bases:
362+ ; GFX11-TRUE16: ; %bb.0: ; %bb
363+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
365+ ; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
366+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
367+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
368+ ;
369+ ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_different_bases:
370+ ; GFX11-FAKE16: ; %bb.0: ; %bb
371+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
372+ ; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
373+ ; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
374+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
375+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
338376bb:
339377 %load_lo = load half , ptr addrspace (1 ) %base_lo
340378 %load_hi = load half , ptr addrspace (1 ) %base_hi
@@ -587,34 +625,65 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
587625; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
588626; FLATSCR_GFX10-NEXT: s_endpgm
589627;
590- ; GFX11-LABEL: vload2_private:
591- ; GFX11: ; %bb.0: ; %entry
592- ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
593- ; GFX11-NEXT: v_mov_b32_e32 v2, 0
594- ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
595- ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1]
596- ; GFX11-NEXT: s_waitcnt vmcnt(0)
597- ; GFX11-NEXT: scratch_store_b16 off, v0, off dlc
598- ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
599- ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
600- ; GFX11-NEXT: s_waitcnt vmcnt(0)
601- ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
602- ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
603- ; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
604- ; GFX11-NEXT: s_waitcnt vmcnt(0)
605- ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
606- ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
607- ; GFX11-NEXT: s_clause 0x1
608- ; GFX11-NEXT: scratch_load_u16 v0, off, off offset:2
609- ; GFX11-NEXT: scratch_load_u16 v3, off, off
610- ; GFX11-NEXT: s_waitcnt vmcnt(1)
611- ; GFX11-NEXT: v_mov_b32_e32 v1, v0
612- ; GFX11-NEXT: s_waitcnt vmcnt(0)
613- ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
614- ; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
615- ; GFX11-NEXT: s_waitcnt vmcnt(0)
616- ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
617- ; GFX11-NEXT: s_endpgm
628+ ; GFX11-TRUE16-LABEL: vload2_private:
629+ ; GFX11-TRUE16: ; %bb.0: ; %entry
630+ ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
631+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
632+ ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
633+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1]
634+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
635+ ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off dlc
636+ ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
637+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:2
638+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
639+ ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
640+ ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
641+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:4
642+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
643+ ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
644+ ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
645+ ; GFX11-TRUE16-NEXT: s_clause 0x1
646+ ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, off offset:2
647+ ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, off
648+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
649+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
650+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
651+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
652+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3
653+ ; GFX11-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
654+ ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
655+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
656+ ; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
657+ ; GFX11-TRUE16-NEXT: s_endpgm
658+ ;
659+ ; GFX11-FAKE16-LABEL: vload2_private:
660+ ; GFX11-FAKE16: ; %bb.0: ; %entry
661+ ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
662+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
663+ ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
664+ ; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1]
665+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
666+ ; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off dlc
667+ ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
668+ ; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
669+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
670+ ; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
671+ ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
672+ ; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
673+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
674+ ; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
675+ ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
676+ ; GFX11-FAKE16-NEXT: s_clause 0x1
677+ ; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, off offset:2
678+ ; GFX11-FAKE16-NEXT: scratch_load_u16 v3, off, off
679+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
680+ ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0
681+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
682+ ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
683+ ; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
684+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
685+ ; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
686+ ; GFX11-FAKE16-NEXT: s_endpgm
618687entry:
619688 %loc = alloca [3 x i16 ], align 2 , addrspace (5 )
620689 %tmp = load i16 , ptr addrspace (1 ) %in , align 2
@@ -836,17 +905,30 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
836905; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
837906; GFX10-NEXT: s_setpc_b64 s[30:31]
838907;
839- ; GFX11-LABEL: chain_hi_to_lo_global_other_dep:
840- ; GFX11: ; %bb.0: ; %bb
841- ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
842- ; GFX11-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc
843- ; GFX11-NEXT: s_waitcnt vmcnt(0)
844- ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
845- ; GFX11-NEXT: s_waitcnt vmcnt(0)
846- ; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
847- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
848- ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
849- ; GFX11-NEXT: s_setpc_b64 s[30:31]
908+ ; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_other_dep:
909+ ; GFX11-TRUE16: ; %bb.0: ; %bb
910+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
911+ ; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2 glc dlc
912+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
913+ ; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
914+ ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
915+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
916+ ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
917+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
918+ ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
919+ ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
920+ ;
921+ ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep:
922+ ; GFX11-FAKE16: ; %bb.0: ; %bb
923+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
924+ ; GFX11-FAKE16-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc
925+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
926+ ; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
927+ ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
928+ ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
929+ ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
930+ ; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
931+ ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
850932bb:
851933 %gep_lo = getelementptr inbounds i16 , ptr addrspace (1 ) %ptr , i64 1
852934 %load_lo = load volatile i16 , ptr addrspace (1 ) %gep_lo
0 commit comments