@@ -82,78 +82,69 @@ define bfloat @v_rcp_bf16_neg(bfloat %x) {
82
82
ret bfloat %fdiv
83
83
}
84
84
85
- ; TODO: Support lowering to v_rsq_bf16.
86
85
define bfloat @v_rsq_bf16 (bfloat %x ) {
87
86
; GFX1250-TRUE16-LABEL: v_rsq_bf16:
88
87
; GFX1250-TRUE16: ; %bb.0:
89
88
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
90
89
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
91
- ; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
92
- ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
93
- ; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
90
+ ; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.l, v0.l
94
91
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
95
92
;
96
93
; GFX1250-FAKE16-LABEL: v_rsq_bf16:
97
94
; GFX1250-FAKE16: ; %bb.0:
98
95
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
99
96
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
100
- ; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
101
- ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
102
- ; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
97
+ ; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v0, v0
103
98
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
104
99
%sqrt = call contract bfloat @llvm.sqrt.bf16 (bfloat %x )
105
100
%fdiv = fdiv contract bfloat 1 .0 , %sqrt
106
101
ret bfloat %fdiv
107
102
}
108
103
109
- ; TODO: Support lowering to v_rsq_bf16.
110
104
define bfloat @v_rsq_bf16_neg (bfloat %x ) {
111
105
; GFX1250-TRUE16-LABEL: v_rsq_bf16_neg:
112
106
; GFX1250-TRUE16: ; %bb.0:
113
107
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
114
108
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
115
- ; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
109
+ ; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.l, v0.l
110
+ ; GFX1250-TRUE16-NEXT: v_nop
116
111
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
117
- ; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, - v0.l
112
+ ; GFX1250-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
118
113
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
119
114
;
120
115
; GFX1250-FAKE16-LABEL: v_rsq_bf16_neg:
121
116
; GFX1250-FAKE16: ; %bb.0:
122
117
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
123
118
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
124
- ; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
119
+ ; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v0, v0
120
+ ; GFX1250-FAKE16-NEXT: v_nop
125
121
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
126
- ; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, - v0
122
+ ; GFX1250-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
127
123
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
128
124
%sqrt = call contract bfloat @llvm.sqrt.bf16 (bfloat %x )
129
125
%fdiv = fdiv contract bfloat -1 .0 , %sqrt
130
126
ret bfloat %fdiv
131
127
}
132
128
133
- ; TODO: Support lowering to v_rsq_bf16.
134
129
define <2 x bfloat> @v_rsq_bf16_multi_use (bfloat %x ) {
135
130
; GFX1250-TRUE16-LABEL: v_rsq_bf16_multi_use:
136
131
; GFX1250-TRUE16: ; %bb.0:
137
132
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
138
133
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
139
134
; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
140
- ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
141
- ; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v1.l
142
- ; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v1.h, v1.l
135
+ ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
136
+ ; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v1.h, v1.l
143
137
; GFX1250-TRUE16-NEXT: v_nop
144
- ; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
145
- ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
146
138
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v1
147
139
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
148
140
;
149
141
; GFX1250-FAKE16-LABEL: v_rsq_bf16_multi_use:
150
142
; GFX1250-FAKE16: ; %bb.0:
151
143
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
152
144
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
153
- ; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0
154
- ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
155
- ; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
145
+ ; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v1, v0
156
146
; GFX1250-FAKE16-NEXT: v_nop
147
+ ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
157
148
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
158
149
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
159
150
%sqrt = call contract bfloat @llvm.sqrt.bf16 (bfloat %x )
@@ -163,7 +154,6 @@ define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) {
163
154
ret <2 x bfloat> %r2
164
155
}
165
156
166
- ; TODO: Support lowering to v_rsq_bf16.
167
157
define bfloat @v_rsq_bf16_missing_contract0 (bfloat %x ) {
168
158
; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract0:
169
159
; GFX1250-TRUE16: ; %bb.0:
@@ -187,7 +177,6 @@ define bfloat @v_rsq_bf16_missing_contract0(bfloat %x) {
187
177
ret bfloat %fdiv
188
178
}
189
179
190
- ; TODO: Support lowering to v_rsq_bf16.
191
180
define bfloat @v_rsq_bf16_missing_contract1 (bfloat %x ) {
192
181
; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract1:
193
182
; GFX1250-TRUE16: ; %bb.0:
@@ -211,7 +200,6 @@ define bfloat @v_rsq_bf16_missing_contract1(bfloat %x) {
211
200
ret bfloat %fdiv
212
201
}
213
202
214
- ; TODO: Support lowering to v_rsq_bf16.
215
203
define bfloat @v_neg_rsq_bf16_missing_contract1 (bfloat %x ) {
216
204
; GFX1250-TRUE16-LABEL: v_neg_rsq_bf16_missing_contract1:
217
205
; GFX1250-TRUE16: ; %bb.0:
@@ -240,24 +228,18 @@ define <2 x bfloat> @v_rsq_v2bf16(<2 x bfloat> %a) {
240
228
; GFX1250-TRUE16: ; %bb.0:
241
229
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
242
230
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
243
- ; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
244
- ; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
245
- ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
246
- ; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.h, v0.h
247
- ; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
231
+ ; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.h, v0.h
232
+ ; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.l, v0.l
248
233
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
249
234
;
250
235
; GFX1250-FAKE16-LABEL: v_rsq_v2bf16:
251
236
; GFX1250-FAKE16: ; %bb.0:
252
237
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
253
238
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
254
239
; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
255
- ; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
256
- ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
257
- ; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
258
- ; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
259
- ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
260
- ; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
240
+ ; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v0, v0
241
+ ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
242
+ ; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v1, v1
261
243
; GFX1250-FAKE16-NEXT: v_nop
262
244
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
263
245
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
@@ -271,25 +253,24 @@ define <2 x bfloat> @v_neg_rsq_v2bf16(<2 x bfloat> %a) {
271
253
; GFX1250-TRUE16: ; %bb.0:
272
254
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
273
255
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
274
- ; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
275
- ; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
276
- ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2 )
277
- ; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.h, - v0.h
278
- ; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, - v0.l
256
+ ; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.h, v0.h
257
+ ; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.l, v0.l
258
+ ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1 )
259
+ ; GFX1250-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v0.h
260
+ ; GFX1250-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
279
261
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
280
262
;
281
263
; GFX1250-FAKE16-LABEL: v_neg_rsq_v2bf16:
282
264
; GFX1250-FAKE16: ; %bb.0:
283
265
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
284
266
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
285
267
; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
286
- ; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
268
+ ; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v0, v0
287
269
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
288
- ; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
289
- ; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
290
- ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
291
- ; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v1, -v1
292
- ; GFX1250-FAKE16-NEXT: v_nop
270
+ ; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v1, v1
271
+ ; GFX1250-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
272
+ ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
273
+ ; GFX1250-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
293
274
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
294
275
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
295
276
%sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16 (<2 x bfloat> %a )
0 commit comments