Skip to content

Commit 7c75502

Browse files
committed
fix an error
1 parent 231edfd commit 7c75502

File tree

8 files changed: 126 additions and 64 deletions.

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7745,7 +7745,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7745 7745     : &AMDGPU::VGPR_32RegClass);
7746 7746     auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7747 7747         .addImm(0) // src0_modifiers
7748      -       .add(Inst.getOperand(1))
     7748 +       .add(Inst.getOperand(2))
7749 7749         .addImm(0) // clamp
7750 7750         .addImm(0); // omod
7751 7751     if (ST.useRealTrue16Insts())

llvm/test/CodeGen/AMDGPU/frem.ll

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -626,14 +626,13 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
626626
; GFX1200-TRUE16-NEXT: s_clause 0x1
627627
; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
628628
; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
629-
; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, 0
630-
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
629+
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
630+
; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
631631
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
632632
; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
633633
; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
634-
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
634+
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
635635
; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
636-
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
637636
; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
638637
; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
639638
; GFX1200-TRUE16-NEXT: s_endpgm
@@ -648,14 +647,13 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
648647
; GFX1200-FAKE16-NEXT: s_clause 0x1
649648
; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
650649
; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
651-
; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, 0
652-
; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
650+
; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
651+
; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
653652
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
654653
; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
655654
; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
656-
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
655+
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
657656
; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
658-
; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
659657
; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
660658
; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
661659
; GFX1200-FAKE16-NEXT: s_endpgm
@@ -876,14 +874,13 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
876874
; GFX1200-TRUE16-NEXT: s_clause 0x1
877875
; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
878876
; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
879-
; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, 0
880-
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
877+
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
878+
; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
881879
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
882880
; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
883881
; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
884-
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
882+
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
885883
; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
886-
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
887884
; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
888885
; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
889886
; GFX1200-TRUE16-NEXT: s_endpgm
@@ -898,14 +895,13 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
898895
; GFX1200-FAKE16-NEXT: s_clause 0x1
899896
; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
900897
; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
901-
; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, 0
902-
; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
898+
; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
899+
; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
903900
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
904901
; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
905902
; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
906-
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
903+
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
907904
; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
908-
; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
909905
; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
910906
; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
911907
; GFX1200-FAKE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,21 +65,37 @@ define amdgpu_kernel void @rcp_f16(
6565
; GFX12-TRUE16-LABEL: rcp_f16:
6666
; GFX12-TRUE16: ; %bb.0: ; %entry
6767
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
68-
; GFX12-TRUE16-NEXT: v_rcp_f16_e32 v0.l, 0
68+
; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
69+
; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
70+
; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
71+
; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
6972
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
70-
; GFX12-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
71-
; GFX12-TRUE16-NEXT: s_mov_b32 s2, -1
72-
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
73+
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
74+
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
75+
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
76+
; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
77+
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
78+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
79+
; GFX12-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
80+
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
7381
; GFX12-TRUE16-NEXT: s_endpgm
7482
;
7583
; GFX12-FAKE16-LABEL: rcp_f16:
7684
; GFX12-FAKE16: ; %bb.0: ; %entry
7785
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
78-
; GFX12-FAKE16-NEXT: v_rcp_f16_e32 v0, 0
86+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
87+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
88+
; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
89+
; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
7990
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
80-
; GFX12-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
81-
; GFX12-FAKE16-NEXT: s_mov_b32 s2, -1
82-
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
91+
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
92+
; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
93+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
94+
; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
95+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
96+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
97+
; GFX12-FAKE16-NEXT: v_rcp_f16_e32 v0, v0
98+
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
8399
; GFX12-FAKE16-NEXT: s_endpgm
84100
ptr addrspace(1) %r,
85101
ptr addrspace(1) %a) {

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,21 +65,37 @@ define amdgpu_kernel void @rsq_f16(
6565
; GFX12-TRUE16-LABEL: rsq_f16:
6666
; GFX12-TRUE16: ; %bb.0: ; %entry
6767
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
68-
; GFX12-TRUE16-NEXT: v_rsq_f16_e32 v0.l, 0
68+
; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
69+
; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
70+
; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
71+
; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
6972
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
70-
; GFX12-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
71-
; GFX12-TRUE16-NEXT: s_mov_b32 s2, -1
72-
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
73+
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
74+
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
75+
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
76+
; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
77+
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
78+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
79+
; GFX12-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
80+
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
7381
; GFX12-TRUE16-NEXT: s_endpgm
7482
;
7583
; GFX12-FAKE16-LABEL: rsq_f16:
7684
; GFX12-FAKE16: ; %bb.0: ; %entry
7785
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
78-
; GFX12-FAKE16-NEXT: v_rsq_f16_e32 v0, 0
86+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
87+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
88+
; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
89+
; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
7990
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
80-
; GFX12-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
81-
; GFX12-FAKE16-NEXT: s_mov_b32 s2, -1
82-
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
91+
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
92+
; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
93+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
94+
; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
95+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
96+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
97+
; GFX12-FAKE16-NEXT: v_rsq_f16_e32 v0, v0
98+
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
8399
; GFX12-FAKE16-NEXT: s_endpgm
84100
ptr addrspace(1) %r,
85101
ptr addrspace(1) %a) {

llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll

Lines changed: 54 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -87,21 +87,37 @@ define amdgpu_kernel void @sqrt_f16(
8787
; GFX12-TRUE16-LABEL: sqrt_f16:
8888
; GFX12-TRUE16: ; %bb.0: ; %entry
8989
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
90-
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, 0
90+
; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
91+
; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
92+
; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
93+
; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
9194
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
92-
; GFX12-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
93-
; GFX12-TRUE16-NEXT: s_mov_b32 s2, -1
94-
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
95+
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
96+
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
97+
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
98+
; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
99+
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
100+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
101+
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
102+
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
95103
; GFX12-TRUE16-NEXT: s_endpgm
96104
;
97105
; GFX12-FAKE16-LABEL: sqrt_f16:
98106
; GFX12-FAKE16: ; %bb.0: ; %entry
99107
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
100-
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, 0
108+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
109+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
110+
; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
111+
; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
101112
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
102-
; GFX12-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
103-
; GFX12-FAKE16-NEXT: s_mov_b32 s2, -1
104-
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
113+
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
114+
; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
115+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
116+
; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
117+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
118+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
119+
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0
120+
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
105121
; GFX12-FAKE16-NEXT: s_endpgm
106122
ptr addrspace(1) %r,
107123
ptr addrspace(1) %a) {
@@ -215,27 +231,45 @@ define amdgpu_kernel void @sqrt_v2f16(
215231
; GFX12-TRUE16-LABEL: sqrt_v2f16:
216232
; GFX12-TRUE16: ; %bb.0: ; %entry
217233
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
218-
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, 0
219-
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.h, 0
234+
; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
235+
; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
236+
; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
237+
; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
220238
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
221-
; GFX12-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
222-
; GFX12-TRUE16-NEXT: s_mov_b32 s2, -1
223-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
239+
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
240+
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
241+
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
242+
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
243+
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
244+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
245+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
246+
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
247+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
248+
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.h, v1.l
224249
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
225-
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], null
250+
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
226251
; GFX12-TRUE16-NEXT: s_endpgm
227252
;
228253
; GFX12-FAKE16-LABEL: sqrt_v2f16:
229254
; GFX12-FAKE16: ; %bb.0: ; %entry
230255
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
231-
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, 0
232-
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v1, 0
256+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
257+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
258+
; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
259+
; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
233260
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
234-
; GFX12-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
235-
; GFX12-FAKE16-NEXT: s_mov_b32 s2, -1
236-
; GFX12-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
261+
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
262+
; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
263+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
264+
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
265+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
266+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
267+
; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
268+
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0
269+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
270+
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v1, v1
237271
; GFX12-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
238-
; GFX12-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], null
272+
; GFX12-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
239273
; GFX12-FAKE16-NEXT: s_endpgm
240274
ptr addrspace(1) %r,
241275
ptr addrspace(1) %a) {

llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
1111
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
1212
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
1313
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
14-
; CHECK-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F16_fake16_e64 0, 0, 0, 0, implicit $mode, implicit $exec
14+
; CHECK-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
1515
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
1616
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]]
1717
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
@@ -32,7 +32,7 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
3232
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
3333
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
3434
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
35-
; CHECK-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_LOG_F16_fake16_e64 0, 0, 0, 0, implicit $mode, implicit $exec
35+
; CHECK-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
3636
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
3737
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]]
3838
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
@@ -53,7 +53,7 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
5353
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
5454
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
5555
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
56-
; CHECK-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_fake16_e64 0, 0, 0, 0, implicit $mode, implicit $exec
56+
; CHECK-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
5757
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
5858
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]]
5959
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
@@ -74,7 +74,7 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
7474
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
7575
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
7676
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
77-
; CHECK-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_fake16_e64 0, 0, 0, 0, implicit $mode, implicit $exec
77+
; CHECK-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
7878
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
7979
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]]
8080
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
@@ -95,7 +95,7 @@ define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
9595
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
9696
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
9797
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
98-
; CHECK-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SQRT_F16_fake16_e64 0, 0, 0, 0, implicit $mode, implicit $exec
98+
; CHECK-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
9999
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
100100
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]]
101101
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)

0 commit comments

Comments
 (0)