1
1
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2
2
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX-942 %s
3
3
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s
4
+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s
4
5
5
6
; TODO: Add -global-isel RUN lines once GlobalISel supports bf16.
6
7
@@ -9,6 +10,11 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
9
10
; GCN: ; %bb.0:
10
11
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
11
12
; GCN-NEXT: ; return to shader part epilog
13
+ ;
14
+ ; GFX1250-LABEL: v_test_cvt_bf16_f32_v:
15
+ ; GFX1250: ; %bb.0:
16
+ ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17
+ ; GFX1250-NEXT: ; return to shader part epilog
12
18
%cvt = fpext bfloat %v to float
13
19
ret float %cvt
14
20
}
@@ -19,6 +25,13 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
19
25
; GCN-NEXT: s_lshl_b32 s0, s0, 16
20
26
; GCN-NEXT: v_mov_b32_e32 v0, s0
21
27
; GCN-NEXT: ; return to shader part epilog
28
+ ;
29
+ ; GFX1250-LABEL: v_test_cvt_bf16_f32_s:
30
+ ; GFX1250: ; %bb.0:
31
+ ; GFX1250-NEXT: s_lshl_b32 s0, s0, 16
32
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
33
+ ; GFX1250-NEXT: v_mov_b32_e32 v0, s0
34
+ ; GFX1250-NEXT: ; return to shader part epilog
22
35
%cvt = fpext bfloat %v to float
23
36
ret float %cvt
24
37
}
@@ -47,6 +60,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
47
60
; GFX-950: ; %bb.0:
48
61
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
49
62
; GFX-950-NEXT: ; return to shader part epilog
63
+ ;
64
+ ; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_v:
65
+ ; GFX1250: ; %bb.0:
66
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
67
+ ; GFX1250-NEXT: ; return to shader part epilog
50
68
%res = fptrunc <2 x float > %src to <2 x bfloat>
51
69
%cast = bitcast <2 x bfloat> %res to float
52
70
ret float %cast
@@ -80,6 +98,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
80
98
; GFX-950-NEXT: v_mov_b32_e32 v0, s1
81
99
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, s0, v0
82
100
; GFX-950-NEXT: ; return to shader part epilog
101
+ ;
102
+ ; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_s:
103
+ ; GFX1250: ; %bb.0:
104
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, s0, s1
105
+ ; GFX1250-NEXT: ; return to shader part epilog
83
106
%res = fptrunc <2 x float > %src to <2 x bfloat>
84
107
%cast = bitcast <2 x bfloat> %res to float
85
108
ret float %cast
@@ -103,6 +126,13 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
103
126
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
104
127
; GFX-950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
105
128
; GFX-950-NEXT: ; return to shader part epilog
129
+ ;
130
+ ; GFX1250-LABEL: v_test_cvt_f32_bf16_v:
131
+ ; GFX1250: ; %bb.0:
132
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
133
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
134
+ ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
135
+ ; GFX1250-NEXT: ; return to shader part epilog
106
136
%trunc = fptrunc float %src to bfloat
107
137
%ext = fpext bfloat %trunc to float
108
138
ret float %ext
@@ -172,6 +202,38 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
172
202
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
173
203
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
174
204
; GFX-950-NEXT: ; return to shader part epilog
205
+ ;
206
+ ; GFX1250-LABEL: v_test_cvt_v2f64_v2bf16_v:
207
+ ; GFX1250: ; %bb.0:
208
+ ; GFX1250-NEXT: v_cvt_f32_f64_e32 v8, v[2:3]
209
+ ; GFX1250-NEXT: v_cvt_f32_f64_e32 v9, v[0:1]
210
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
211
+ ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
212
+ ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v9
213
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
214
+ ; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[2:3]|, |v[4:5]|
215
+ ; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[2:3], v[4:5]
216
+ ; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, v[0:1], v[6:7]
217
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
218
+ ; GFX1250-NEXT: v_cndmask_b32_e64 v2, -1, 1, s1
219
+ ; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[6:7]|
220
+ ; GFX1250-NEXT: v_dual_add_nc_u32 v1, v8, v2 :: v_dual_bitop2_b32 v10, 1, v8 bitop3:0x40
221
+ ; GFX1250-NEXT: s_wait_alu 0xf1ff
222
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
223
+ ; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
224
+ ; GFX1250-NEXT: v_and_b32_e32 v11, 1, v9
225
+ ; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, 1, v10
226
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
227
+ ; GFX1250-NEXT: v_add_nc_u32_e32 v0, v9, v0
228
+ ; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, 1, v11
229
+ ; GFX1250-NEXT: s_or_b32 vcc_lo, s1, vcc_lo
230
+ ; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
231
+ ; GFX1250-NEXT: s_or_b32 vcc_lo, s2, s0
232
+ ; GFX1250-NEXT: s_wait_alu 0xfffe
233
+ ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
234
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
235
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
236
+ ; GFX1250-NEXT: ; return to shader part epilog
175
237
%res = fptrunc <2 x double > %src to <2 x bfloat>
176
238
%cast = bitcast <2 x bfloat> %res to float
177
239
ret float %cast
@@ -201,6 +263,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
201
263
; GFX-950: ; %bb.0: ; %entry
202
264
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
203
265
; GFX-950-NEXT: ; return to shader part epilog
266
+ ;
267
+ ; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16:
268
+ ; GFX1250: ; %bb.0: ; %entry
269
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
270
+ ; GFX1250-NEXT: ; return to shader part epilog
204
271
entry:
205
272
%a.cvt = fptrunc float %a to bfloat
206
273
%b.cvt = fptrunc float %b to bfloat
@@ -236,6 +303,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
236
303
; GFX-950: ; %bb.0: ; %entry
237
304
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, -v0, |v1|
238
305
; GFX-950-NEXT: ; return to shader part epilog
306
+ ;
307
+ ; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
308
+ ; GFX1250: ; %bb.0: ; %entry
309
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, |v1|
310
+ ; GFX1250-NEXT: ; return to shader part epilog
239
311
entry:
240
312
%a.neg = fneg float %a
241
313
%a.cvt = fptrunc float %a.neg to bfloat
@@ -269,6 +341,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
269
341
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
270
342
; GFX-950-NEXT: flat_store_short v[2:3], v0
271
343
; GFX-950-NEXT: s_endpgm
344
+ ;
345
+ ; GFX1250-LABEL: fptrunc_f32_to_bf16:
346
+ ; GFX1250: ; %bb.0: ; %entry
347
+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
348
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
349
+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
350
+ ; GFX1250-NEXT: s_endpgm
272
351
entry:
273
352
%a.cvt = fptrunc float %a to bfloat
274
353
store bfloat %a.cvt , ptr %out
@@ -298,6 +377,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
298
377
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
299
378
; GFX-950-NEXT: flat_store_short v[2:3], v0
300
379
; GFX-950-NEXT: s_endpgm
380
+ ;
381
+ ; GFX1250-LABEL: fptrunc_f32_to_bf16_abs:
382
+ ; GFX1250: ; %bb.0: ; %entry
383
+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
384
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
385
+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
386
+ ; GFX1250-NEXT: s_endpgm
301
387
entry:
302
388
%a.abs = call float @llvm.fabs.f32 (float %a )
303
389
%a.cvt = fptrunc float %a.abs to bfloat
@@ -328,6 +414,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
328
414
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
329
415
; GFX-950-NEXT: flat_store_short v[2:3], v0
330
416
; GFX-950-NEXT: s_endpgm
417
+ ;
418
+ ; GFX1250-LABEL: fptrunc_f32_to_bf16_neg:
419
+ ; GFX1250: ; %bb.0: ; %entry
420
+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
421
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
422
+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
423
+ ; GFX1250-NEXT: s_endpgm
331
424
entry:
332
425
%a.neg = fneg float %a
333
426
%a.cvt = fptrunc float %a.neg to bfloat
@@ -373,6 +466,24 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
373
466
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
374
467
; GFX-950-NEXT: flat_store_short v[2:3], v0
375
468
; GFX-950-NEXT: s_endpgm
469
+ ;
470
+ ; GFX1250-LABEL: fptrunc_f64_to_bf16:
471
+ ; GFX1250: ; %bb.0: ; %entry
472
+ ; GFX1250-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
473
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
474
+ ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
475
+ ; GFX1250-NEXT: v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]|
476
+ ; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5]
477
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
478
+ ; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s0
479
+ ; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
480
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
481
+ ; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v7
482
+ ; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
483
+ ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
484
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
485
+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
486
+ ; GFX1250-NEXT: s_endpgm
376
487
entry:
377
488
%a.cvt = fptrunc double %a to bfloat
378
489
store bfloat %a.cvt , ptr %out
@@ -417,6 +528,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
417
528
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
418
529
; GFX-950-NEXT: flat_store_short v[2:3], v0
419
530
; GFX-950-NEXT: s_endpgm
531
+ ;
532
+ ; GFX1250-LABEL: fptrunc_f64_to_bf16_neg:
533
+ ; GFX1250: ; %bb.0: ; %entry
534
+ ; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, -v[0:1]
535
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
536
+ ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
537
+ ; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]|
538
+ ; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, -v[0:1], v[4:5]
539
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
540
+ ; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
541
+ ; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
542
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
543
+ ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
544
+ ; GFX1250-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
545
+ ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
546
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
547
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
548
+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
549
+ ; GFX1250-NEXT: s_endpgm
420
550
entry:
421
551
%a.neg = fneg double %a
422
552
%a.cvt = fptrunc double %a.neg to bfloat
@@ -462,6 +592,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
462
592
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
463
593
; GFX-950-NEXT: flat_store_short v[2:3], v0
464
594
; GFX-950-NEXT: s_endpgm
595
+ ;
596
+ ; GFX1250-LABEL: fptrunc_f64_to_bf16_abs:
597
+ ; GFX1250: ; %bb.0: ; %entry
598
+ ; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
599
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
600
+ ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
601
+ ; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]|
602
+ ; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
603
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
604
+ ; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
605
+ ; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
606
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
607
+ ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
608
+ ; GFX1250-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
609
+ ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
610
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
611
+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
612
+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
613
+ ; GFX1250-NEXT: s_endpgm
465
614
entry:
466
615
%a.abs = call double @llvm.fabs.f64 (double %a )
467
616
%a.cvt = fptrunc double %a.abs to bfloat
0 commit comments