Skip to content

Commit ba81903

Browse files
shiltian and kosarev authored
[gfx1250][SDAG] Lower unsafe bf16 divisions (#149628)
Co-authored-by: Kosarev, Ivan <[email protected]>
1 parent ce345cc commit ba81903

File tree

2 files changed

+307
-4
lines changed

2 files changed

+307
-4
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
618618
ISD::FSIN, ISD::FROUND},
619619
MVT::f16, Custom);
620620

621+
// BF16 - VOP1 Actions.
622+
if (Subtarget->hasBF16TransInsts())
623+
setOperationAction(ISD::FDIV, MVT::bf16, Custom);
624+
621625
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
622626
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
623627

@@ -11200,7 +11204,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
1120011204
// Without !fpmath accuracy information, we can't do more because we don't
1120111205
// know exactly whether rcp is accurate enough to meet !fpmath requirement.
1120211206
// f16 and bf16 are always treated as accurate enough
11203-
if (!AllowInaccurateRcp && VT != MVT::f16)
11207+
if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
1120411208
return SDValue();
1120511209

1120611210
if (CLHS->isExactlyValue(1.0)) {
@@ -11227,9 +11231,10 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
1122711231
}
1122811232
}
1122911233

11230-
// For f16 require afn or arcp.
11234+
// For f16 and bf16 require afn or arcp.
1123111235
// For f32 require afn.
11232-
if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
11236+
if (!AllowInaccurateRcp &&
11237+
((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
1123311238
return SDValue();
1123411239

1123511240
// Turn into multiply by the reciprocal.
@@ -11620,7 +11625,7 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
1162011625
if (VT == MVT::f64)
1162111626
return LowerFDIV64(Op, DAG);
1162211627

11623-
if (VT == MVT::f16)
11628+
if (VT == MVT::f16 || VT == MVT::bf16)
1162411629
return LowerFDIV16(Op, DAG);
1162511630

1162611631
llvm_unreachable("Unexpected type for fdiv");

llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll

Lines changed: 298 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,298 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-TRUE16 %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s
4+
5+
/* TODO: Support safe bf16 fdiv lowering.
6+
define bfloat @v_fdiv_bf16(bfloat %x, bfloat %y) {
7+
%fdiv = fdiv bfloat %x, %y
8+
ret bfloat %fdiv
9+
}
10+
*/
11+
12+
define bfloat @v_rcp_bf16(bfloat %x) {
13+
; GFX1250-TRUE16-LABEL: v_rcp_bf16:
14+
; GFX1250-TRUE16: ; %bb.0:
15+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
16+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
17+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
18+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
19+
;
20+
; GFX1250-FAKE16-LABEL: v_rcp_bf16:
21+
; GFX1250-FAKE16: ; %bb.0:
22+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
23+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
24+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
25+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
26+
%fdiv = fdiv bfloat 1.0, %x
27+
ret bfloat %fdiv
28+
}
29+
30+
define bfloat @v_rcp_bf16_abs(bfloat %x) {
31+
; GFX1250-TRUE16-LABEL: v_rcp_bf16_abs:
32+
; GFX1250-TRUE16: ; %bb.0:
33+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
34+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
35+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, |v0.l|
36+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
37+
;
38+
; GFX1250-FAKE16-LABEL: v_rcp_bf16_abs:
39+
; GFX1250-FAKE16: ; %bb.0:
40+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
41+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
42+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, |v0|
43+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
44+
%fabs = call bfloat @llvm.fabs.bf16(bfloat %x)
45+
%fdiv = fdiv bfloat 1.0, %fabs
46+
ret bfloat %fdiv
47+
}
48+
49+
define bfloat @v_rcp_bf16_afn(bfloat %x) {
50+
; GFX1250-TRUE16-LABEL: v_rcp_bf16_afn:
51+
; GFX1250-TRUE16: ; %bb.0:
52+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
53+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
54+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
55+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
56+
;
57+
; GFX1250-FAKE16-LABEL: v_rcp_bf16_afn:
58+
; GFX1250-FAKE16: ; %bb.0:
59+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
60+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
61+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
62+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
63+
%fdiv = fdiv afn bfloat 1.0, %x
64+
ret bfloat %fdiv
65+
}
66+
67+
define bfloat @v_rcp_bf16_neg(bfloat %x) {
68+
; GFX1250-TRUE16-LABEL: v_rcp_bf16_neg:
69+
; GFX1250-TRUE16: ; %bb.0:
70+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
71+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
72+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
73+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
74+
;
75+
; GFX1250-FAKE16-LABEL: v_rcp_bf16_neg:
76+
; GFX1250-FAKE16: ; %bb.0:
77+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
78+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
79+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
80+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
81+
%fdiv = fdiv bfloat -1.0, %x
82+
ret bfloat %fdiv
83+
}
84+
85+
; TODO: Support lowering to v_rsq_bf16.
86+
define bfloat @v_rsq_bf16(bfloat %x) {
87+
; GFX1250-TRUE16-LABEL: v_rsq_bf16:
88+
; GFX1250-TRUE16: ; %bb.0:
89+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
90+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
91+
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
92+
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
93+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
94+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
95+
;
96+
; GFX1250-FAKE16-LABEL: v_rsq_bf16:
97+
; GFX1250-FAKE16: ; %bb.0:
98+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
99+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
100+
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
101+
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
102+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
103+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
104+
%sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
105+
%fdiv = fdiv contract bfloat 1.0, %sqrt
106+
ret bfloat %fdiv
107+
}
108+
109+
; TODO: Support lowering to v_rsq_bf16.
110+
define bfloat @v_rsq_bf16_neg(bfloat %x) {
111+
; GFX1250-TRUE16-LABEL: v_rsq_bf16_neg:
112+
; GFX1250-TRUE16: ; %bb.0:
113+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
114+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
115+
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
116+
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
117+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
118+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
119+
;
120+
; GFX1250-FAKE16-LABEL: v_rsq_bf16_neg:
121+
; GFX1250-FAKE16: ; %bb.0:
122+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
123+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
124+
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
125+
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
126+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
127+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
128+
%sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
129+
%fdiv = fdiv contract bfloat -1.0, %sqrt
130+
ret bfloat %fdiv
131+
}
132+
133+
; TODO: Support lowering to v_rsq_bf16.
134+
define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) {
135+
; GFX1250-TRUE16-LABEL: v_rsq_bf16_multi_use:
136+
; GFX1250-TRUE16: ; %bb.0:
137+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
138+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
139+
; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
140+
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
141+
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v1.l
142+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v1.h, v1.l
143+
; GFX1250-TRUE16-NEXT: v_nop
144+
; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
145+
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
146+
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v1
147+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
148+
;
149+
; GFX1250-FAKE16-LABEL: v_rsq_bf16_multi_use:
150+
; GFX1250-FAKE16: ; %bb.0:
151+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
152+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
153+
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0
154+
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
155+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
156+
; GFX1250-FAKE16-NEXT: v_nop
157+
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
158+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
159+
%sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
160+
%fdiv = fdiv contract bfloat 1.0, %sqrt
161+
%r = insertelement <2 x bfloat> zeroinitializer, bfloat %x, i32 0
162+
%r2 = insertelement <2 x bfloat> %r, bfloat %fdiv, i32 1
163+
ret <2 x bfloat> %r2
164+
}
165+
166+
; TODO: Support lowering to v_rsq_bf16.
167+
define bfloat @v_rsq_bf16_missing_contract0(bfloat %x) {
168+
; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract0:
169+
; GFX1250-TRUE16: ; %bb.0:
170+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
171+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
172+
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
173+
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
174+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
175+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
176+
;
177+
; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract0:
178+
; GFX1250-FAKE16: ; %bb.0:
179+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
180+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
181+
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
182+
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
183+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
184+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
185+
%sqrt = call bfloat @llvm.sqrt.bf16(bfloat %x)
186+
%fdiv = fdiv contract bfloat 1.0, %sqrt
187+
ret bfloat %fdiv
188+
}
189+
190+
; TODO: Support lowering to v_rsq_bf16.
191+
define bfloat @v_rsq_bf16_missing_contract1(bfloat %x) {
192+
; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract1:
193+
; GFX1250-TRUE16: ; %bb.0:
194+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
195+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
196+
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
197+
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
198+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
199+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
200+
;
201+
; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract1:
202+
; GFX1250-FAKE16: ; %bb.0:
203+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
204+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
205+
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
206+
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
207+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
208+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
209+
%sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
210+
%fdiv = fdiv bfloat 1.0, %sqrt
211+
ret bfloat %fdiv
212+
}
213+
214+
; TODO: Support lowering to v_rsq_bf16.
215+
define bfloat @v_neg_rsq_bf16_missing_contract1(bfloat %x) {
216+
; GFX1250-TRUE16-LABEL: v_neg_rsq_bf16_missing_contract1:
217+
; GFX1250-TRUE16: ; %bb.0:
218+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
219+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
220+
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
221+
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
222+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
223+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
224+
;
225+
; GFX1250-FAKE16-LABEL: v_neg_rsq_bf16_missing_contract1:
226+
; GFX1250-FAKE16: ; %bb.0:
227+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
228+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
229+
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
230+
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
231+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
232+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
233+
%sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
234+
%fdiv = fdiv bfloat -1.0, %sqrt
235+
ret bfloat %fdiv
236+
}
237+
238+
define <2 x bfloat> @v_rsq_v2bf16(<2 x bfloat> %a) {
239+
; GFX1250-TRUE16-LABEL: v_rsq_v2bf16:
240+
; GFX1250-TRUE16: ; %bb.0:
241+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
242+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
243+
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
244+
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
245+
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
246+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.h, v0.h
247+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
248+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
249+
;
250+
; GFX1250-FAKE16-LABEL: v_rsq_v2bf16:
251+
; GFX1250-FAKE16: ; %bb.0:
252+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
253+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
254+
; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
255+
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
256+
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
257+
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
258+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
259+
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
260+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
261+
; GFX1250-FAKE16-NEXT: v_nop
262+
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
263+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
264+
%sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a)
265+
%fdiv = fdiv contract <2 x bfloat> <bfloat 1.0, bfloat 1.0>, %sqrt
266+
ret <2 x bfloat> %fdiv
267+
}
268+
269+
define <2 x bfloat> @v_neg_rsq_v2bf16(<2 x bfloat> %a) {
270+
; GFX1250-TRUE16-LABEL: v_neg_rsq_v2bf16:
271+
; GFX1250-TRUE16: ; %bb.0:
272+
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
273+
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
274+
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
275+
; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
276+
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
277+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.h, -v0.h
278+
; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
279+
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
280+
;
281+
; GFX1250-FAKE16-LABEL: v_neg_rsq_v2bf16:
282+
; GFX1250-FAKE16: ; %bb.0:
283+
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
284+
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
285+
; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
286+
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
287+
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
288+
; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
289+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
290+
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
291+
; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v1, -v1
292+
; GFX1250-FAKE16-NEXT: v_nop
293+
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
294+
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
295+
%sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a)
296+
%fdiv = fdiv contract <2 x bfloat> <bfloat -1.0, bfloat -1.0>, %sqrt
297+
ret <2 x bfloat> %fdiv
298+
}

0 commit comments

Comments
 (0)