11; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
22; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX-942 %s
33; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s
4+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s
45
56; TODO: Add global-isel when it can support bf16
67
@@ -9,6 +10,11 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
910; GCN: ; %bb.0:
1011; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1112; GCN-NEXT: ; return to shader part epilog
13+ ;
14+ ; GFX1250-LABEL: v_test_cvt_bf16_f32_v:
15+ ; GFX1250: ; %bb.0:
16+ ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17+ ; GFX1250-NEXT: ; return to shader part epilog
1218 %cvt = fpext bfloat %v to float
1319 ret float %cvt
1420}
@@ -19,6 +25,13 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
1925; GCN-NEXT: s_lshl_b32 s0, s0, 16
2026; GCN-NEXT: v_mov_b32_e32 v0, s0
2127; GCN-NEXT: ; return to shader part epilog
28+ ;
29+ ; GFX1250-LABEL: v_test_cvt_bf16_f32_s:
30+ ; GFX1250: ; %bb.0:
31+ ; GFX1250-NEXT: s_lshl_b32 s0, s0, 16
32+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
33+ ; GFX1250-NEXT: v_mov_b32_e32 v0, s0
34+ ; GFX1250-NEXT: ; return to shader part epilog
2235 %cvt = fpext bfloat %v to float
2336 ret float %cvt
2437}
@@ -47,6 +60,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
4760; GFX-950: ; %bb.0:
4861; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
4962; GFX-950-NEXT: ; return to shader part epilog
63+ ;
64+ ; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_v:
65+ ; GFX1250: ; %bb.0:
66+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
67+ ; GFX1250-NEXT: ; return to shader part epilog
5068 %res = fptrunc <2 x float > %src to <2 x bfloat>
5169 %cast = bitcast <2 x bfloat> %res to float
5270 ret float %cast
@@ -80,6 +98,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
8098; GFX-950-NEXT: v_mov_b32_e32 v0, s1
8199; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, s0, v0
82100; GFX-950-NEXT: ; return to shader part epilog
101+ ;
102+ ; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_s:
103+ ; GFX1250: ; %bb.0:
104+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, s0, s1
105+ ; GFX1250-NEXT: ; return to shader part epilog
83106 %res = fptrunc <2 x float > %src to <2 x bfloat>
84107 %cast = bitcast <2 x bfloat> %res to float
85108 ret float %cast
@@ -103,6 +126,13 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
103126; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
104127; GFX-950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
105128; GFX-950-NEXT: ; return to shader part epilog
129+ ;
130+ ; GFX1250-LABEL: v_test_cvt_f32_bf16_v:
131+ ; GFX1250: ; %bb.0:
132+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
133+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
134+ ; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
135+ ; GFX1250-NEXT: ; return to shader part epilog
106136 %trunc = fptrunc float %src to bfloat
107137 %ext = fpext bfloat %trunc to float
108138 ret float %ext
@@ -172,6 +202,38 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
172202; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
173203; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
174204; GFX-950-NEXT: ; return to shader part epilog
205+ ;
206+ ; GFX1250-LABEL: v_test_cvt_v2f64_v2bf16_v:
207+ ; GFX1250: ; %bb.0:
208+ ; GFX1250-NEXT: v_cvt_f32_f64_e32 v8, v[2:3]
209+ ; GFX1250-NEXT: v_cvt_f32_f64_e32 v9, v[0:1]
210+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
211+ ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
212+ ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v9
213+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
214+ ; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[2:3]|, |v[4:5]|
215+ ; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[2:3], v[4:5]
216+ ; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, v[0:1], v[6:7]
217+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
218+ ; GFX1250-NEXT: v_cndmask_b32_e64 v2, -1, 1, s1
219+ ; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[6:7]|
220+ ; GFX1250-NEXT: v_dual_add_nc_u32 v1, v8, v2 :: v_dual_bitop2_b32 v10, 1, v8 bitop3:0x40
221+ ; GFX1250-NEXT: s_wait_alu 0xf1ff
222+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
223+ ; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
224+ ; GFX1250-NEXT: v_and_b32_e32 v11, 1, v9
225+ ; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, 1, v10
226+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
227+ ; GFX1250-NEXT: v_add_nc_u32_e32 v0, v9, v0
228+ ; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, 1, v11
229+ ; GFX1250-NEXT: s_or_b32 vcc_lo, s1, vcc_lo
230+ ; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
231+ ; GFX1250-NEXT: s_or_b32 vcc_lo, s2, s0
232+ ; GFX1250-NEXT: s_wait_alu 0xfffe
233+ ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
234+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
235+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
236+ ; GFX1250-NEXT: ; return to shader part epilog
175237 %res = fptrunc <2 x double > %src to <2 x bfloat>
176238 %cast = bitcast <2 x bfloat> %res to float
177239 ret float %cast
@@ -201,6 +263,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
201263; GFX-950: ; %bb.0: ; %entry
202264; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
203265; GFX-950-NEXT: ; return to shader part epilog
266+ ;
267+ ; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16:
268+ ; GFX1250: ; %bb.0: ; %entry
269+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
270+ ; GFX1250-NEXT: ; return to shader part epilog
204271entry:
205272 %a.cvt = fptrunc float %a to bfloat
206273 %b.cvt = fptrunc float %b to bfloat
@@ -236,6 +303,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
236303; GFX-950: ; %bb.0: ; %entry
237304; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, -v0, |v1|
238305; GFX-950-NEXT: ; return to shader part epilog
306+ ;
307+ ; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
308+ ; GFX1250: ; %bb.0: ; %entry
309+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, |v1|
310+ ; GFX1250-NEXT: ; return to shader part epilog
239311entry:
240312 %a.neg = fneg float %a
241313 %a.cvt = fptrunc float %a.neg to bfloat
@@ -269,6 +341,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
269341; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
270342; GFX-950-NEXT: flat_store_short v[2:3], v0
271343; GFX-950-NEXT: s_endpgm
344+ ;
345+ ; GFX1250-LABEL: fptrunc_f32_to_bf16:
346+ ; GFX1250: ; %bb.0: ; %entry
347+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
348+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
349+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
350+ ; GFX1250-NEXT: s_endpgm
272351entry:
273352 %a.cvt = fptrunc float %a to bfloat
274353 store bfloat %a.cvt , ptr %out
@@ -298,6 +377,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
298377; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
299378; GFX-950-NEXT: flat_store_short v[2:3], v0
300379; GFX-950-NEXT: s_endpgm
380+ ;
381+ ; GFX1250-LABEL: fptrunc_f32_to_bf16_abs:
382+ ; GFX1250: ; %bb.0: ; %entry
383+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
384+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
385+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
386+ ; GFX1250-NEXT: s_endpgm
301387entry:
302388 %a.abs = call float @llvm.fabs.f32 (float %a )
303389 %a.cvt = fptrunc float %a.abs to bfloat
@@ -328,6 +414,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
328414; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
329415; GFX-950-NEXT: flat_store_short v[2:3], v0
330416; GFX-950-NEXT: s_endpgm
417+ ;
418+ ; GFX1250-LABEL: fptrunc_f32_to_bf16_neg:
419+ ; GFX1250: ; %bb.0: ; %entry
420+ ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
421+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
422+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
423+ ; GFX1250-NEXT: s_endpgm
331424entry:
332425 %a.neg = fneg float %a
333426 %a.cvt = fptrunc float %a.neg to bfloat
@@ -373,6 +466,24 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
373466; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
374467; GFX-950-NEXT: flat_store_short v[2:3], v0
375468; GFX-950-NEXT: s_endpgm
469+ ;
470+ ; GFX1250-LABEL: fptrunc_f64_to_bf16:
471+ ; GFX1250: ; %bb.0: ; %entry
472+ ; GFX1250-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
473+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
474+ ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
475+ ; GFX1250-NEXT: v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]|
476+ ; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5]
477+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
478+ ; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s0
479+ ; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
480+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
481+ ; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v7
482+ ; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
483+ ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
484+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
485+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
486+ ; GFX1250-NEXT: s_endpgm
376487entry:
377488 %a.cvt = fptrunc double %a to bfloat
378489 store bfloat %a.cvt , ptr %out
@@ -417,6 +528,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
417528; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
418529; GFX-950-NEXT: flat_store_short v[2:3], v0
419530; GFX-950-NEXT: s_endpgm
531+ ;
532+ ; GFX1250-LABEL: fptrunc_f64_to_bf16_neg:
533+ ; GFX1250: ; %bb.0: ; %entry
534+ ; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, -v[0:1]
535+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
536+ ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
537+ ; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]|
538+ ; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, -v[0:1], v[4:5]
539+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
540+ ; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
541+ ; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
542+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
543+ ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
544+ ; GFX1250-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
545+ ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
546+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
547+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
548+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
549+ ; GFX1250-NEXT: s_endpgm
420550entry:
421551 %a.neg = fneg double %a
422552 %a.cvt = fptrunc double %a.neg to bfloat
@@ -462,6 +592,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
462592; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
463593; GFX-950-NEXT: flat_store_short v[2:3], v0
464594; GFX-950-NEXT: s_endpgm
595+ ;
596+ ; GFX1250-LABEL: fptrunc_f64_to_bf16_abs:
597+ ; GFX1250: ; %bb.0: ; %entry
598+ ; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
599+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
600+ ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
601+ ; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]|
602+ ; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
603+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
604+ ; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
605+ ; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
606+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
607+ ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
608+ ; GFX1250-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
609+ ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
610+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
611+ ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
612+ ; GFX1250-NEXT: flat_store_b16 v[2:3], v0
613+ ; GFX1250-NEXT: s_endpgm
465614entry:
466615 %a.abs = call double @llvm.fabs.f64 (double %a )
467616 %a.cvt = fptrunc double %a.abs to bfloat
0 commit comments