Skip to content

Commit fbeb801

Browse files
shiltian and rampitec authored
[AMDGPU] Add support for v_cvt_pk_bf16_f32 on gfx1250 (#150053)
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
1 parent b2c38f1 commit fbeb801

11 files changed

+502 -0 lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1918,6 +1918,7 @@ let AssemblerPredicate = isGFX11Plus in {
19181918

19191919
// These instructions differ from GFX12 variant by supporting DPP:
19201920
defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>;
1921+
defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>;
19211922

19221923
//===----------------------------------------------------------------------===//
19231924
// GFX10.

llvm/test/CodeGen/AMDGPU/bf16-conversions.ll

Lines changed: 149 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
22
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX-942 %s
33
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s
4+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s
45

56
; TODO: Add global-isel when it can support bf16
67

@@ -9,6 +10,11 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
910
; GCN: ; %bb.0:
1011
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1112
; GCN-NEXT: ; return to shader part epilog
13+
;
14+
; GFX1250-LABEL: v_test_cvt_bf16_f32_v:
15+
; GFX1250: ; %bb.0:
16+
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
17+
; GFX1250-NEXT: ; return to shader part epilog
1218
%cvt = fpext bfloat %v to float
1319
ret float %cvt
1420
}
@@ -19,6 +25,13 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
1925
; GCN-NEXT: s_lshl_b32 s0, s0, 16
2026
; GCN-NEXT: v_mov_b32_e32 v0, s0
2127
; GCN-NEXT: ; return to shader part epilog
28+
;
29+
; GFX1250-LABEL: v_test_cvt_bf16_f32_s:
30+
; GFX1250: ; %bb.0:
31+
; GFX1250-NEXT: s_lshl_b32 s0, s0, 16
32+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
33+
; GFX1250-NEXT: v_mov_b32_e32 v0, s0
34+
; GFX1250-NEXT: ; return to shader part epilog
2235
%cvt = fpext bfloat %v to float
2336
ret float %cvt
2437
}
@@ -47,6 +60,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
4760
; GFX-950: ; %bb.0:
4861
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
4962
; GFX-950-NEXT: ; return to shader part epilog
63+
;
64+
; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_v:
65+
; GFX1250: ; %bb.0:
66+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
67+
; GFX1250-NEXT: ; return to shader part epilog
5068
%res = fptrunc <2 x float> %src to <2 x bfloat>
5169
%cast = bitcast <2 x bfloat> %res to float
5270
ret float %cast
@@ -80,6 +98,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
8098
; GFX-950-NEXT: v_mov_b32_e32 v0, s1
8199
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, s0, v0
82100
; GFX-950-NEXT: ; return to shader part epilog
101+
;
102+
; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_s:
103+
; GFX1250: ; %bb.0:
104+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, s0, s1
105+
; GFX1250-NEXT: ; return to shader part epilog
83106
%res = fptrunc <2 x float> %src to <2 x bfloat>
84107
%cast = bitcast <2 x bfloat> %res to float
85108
ret float %cast
@@ -103,6 +126,13 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
103126
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
104127
; GFX-950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
105128
; GFX-950-NEXT: ; return to shader part epilog
129+
;
130+
; GFX1250-LABEL: v_test_cvt_f32_bf16_v:
131+
; GFX1250: ; %bb.0:
132+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
133+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
134+
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
135+
; GFX1250-NEXT: ; return to shader part epilog
106136
%trunc = fptrunc float %src to bfloat
107137
%ext = fpext bfloat %trunc to float
108138
ret float %ext
@@ -172,6 +202,38 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
172202
; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
173203
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
174204
; GFX-950-NEXT: ; return to shader part epilog
205+
;
206+
; GFX1250-LABEL: v_test_cvt_v2f64_v2bf16_v:
207+
; GFX1250: ; %bb.0:
208+
; GFX1250-NEXT: v_cvt_f32_f64_e32 v8, v[2:3]
209+
; GFX1250-NEXT: v_cvt_f32_f64_e32 v9, v[0:1]
210+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
211+
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
212+
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v9
213+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
214+
; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[2:3]|, |v[4:5]|
215+
; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[2:3], v[4:5]
216+
; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, v[0:1], v[6:7]
217+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
218+
; GFX1250-NEXT: v_cndmask_b32_e64 v2, -1, 1, s1
219+
; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[6:7]|
220+
; GFX1250-NEXT: v_dual_add_nc_u32 v1, v8, v2 :: v_dual_bitop2_b32 v10, 1, v8 bitop3:0x40
221+
; GFX1250-NEXT: s_wait_alu 0xf1ff
222+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
223+
; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
224+
; GFX1250-NEXT: v_and_b32_e32 v11, 1, v9
225+
; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, 1, v10
226+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
227+
; GFX1250-NEXT: v_add_nc_u32_e32 v0, v9, v0
228+
; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, 1, v11
229+
; GFX1250-NEXT: s_or_b32 vcc_lo, s1, vcc_lo
230+
; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
231+
; GFX1250-NEXT: s_or_b32 vcc_lo, s2, s0
232+
; GFX1250-NEXT: s_wait_alu 0xfffe
233+
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
234+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
235+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
236+
; GFX1250-NEXT: ; return to shader part epilog
175237
%res = fptrunc <2 x double> %src to <2 x bfloat>
176238
%cast = bitcast <2 x bfloat> %res to float
177239
ret float %cast
@@ -201,6 +263,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
201263
; GFX-950: ; %bb.0: ; %entry
202264
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
203265
; GFX-950-NEXT: ; return to shader part epilog
266+
;
267+
; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16:
268+
; GFX1250: ; %bb.0: ; %entry
269+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
270+
; GFX1250-NEXT: ; return to shader part epilog
204271
entry:
205272
%a.cvt = fptrunc float %a to bfloat
206273
%b.cvt = fptrunc float %b to bfloat
@@ -236,6 +303,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
236303
; GFX-950: ; %bb.0: ; %entry
237304
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, -v0, |v1|
238305
; GFX-950-NEXT: ; return to shader part epilog
306+
;
307+
; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
308+
; GFX1250: ; %bb.0: ; %entry
309+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, |v1|
310+
; GFX1250-NEXT: ; return to shader part epilog
239311
entry:
240312
%a.neg = fneg float %a
241313
%a.cvt = fptrunc float %a.neg to bfloat
@@ -269,6 +341,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
269341
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
270342
; GFX-950-NEXT: flat_store_short v[2:3], v0
271343
; GFX-950-NEXT: s_endpgm
344+
;
345+
; GFX1250-LABEL: fptrunc_f32_to_bf16:
346+
; GFX1250: ; %bb.0: ; %entry
347+
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
348+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
349+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
350+
; GFX1250-NEXT: s_endpgm
272351
entry:
273352
%a.cvt = fptrunc float %a to bfloat
274353
store bfloat %a.cvt, ptr %out
@@ -298,6 +377,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
298377
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
299378
; GFX-950-NEXT: flat_store_short v[2:3], v0
300379
; GFX-950-NEXT: s_endpgm
380+
;
381+
; GFX1250-LABEL: fptrunc_f32_to_bf16_abs:
382+
; GFX1250: ; %bb.0: ; %entry
383+
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
384+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
385+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
386+
; GFX1250-NEXT: s_endpgm
301387
entry:
302388
%a.abs = call float @llvm.fabs.f32(float %a)
303389
%a.cvt = fptrunc float %a.abs to bfloat
@@ -328,6 +414,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
328414
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
329415
; GFX-950-NEXT: flat_store_short v[2:3], v0
330416
; GFX-950-NEXT: s_endpgm
417+
;
418+
; GFX1250-LABEL: fptrunc_f32_to_bf16_neg:
419+
; GFX1250: ; %bb.0: ; %entry
420+
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
421+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
422+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
423+
; GFX1250-NEXT: s_endpgm
331424
entry:
332425
%a.neg = fneg float %a
333426
%a.cvt = fptrunc float %a.neg to bfloat
@@ -373,6 +466,24 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
373466
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
374467
; GFX-950-NEXT: flat_store_short v[2:3], v0
375468
; GFX-950-NEXT: s_endpgm
469+
;
470+
; GFX1250-LABEL: fptrunc_f64_to_bf16:
471+
; GFX1250: ; %bb.0: ; %entry
472+
; GFX1250-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
473+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
474+
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
475+
; GFX1250-NEXT: v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]|
476+
; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5]
477+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
478+
; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s0
479+
; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
480+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
481+
; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v7
482+
; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
483+
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
484+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
485+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
486+
; GFX1250-NEXT: s_endpgm
376487
entry:
377488
%a.cvt = fptrunc double %a to bfloat
378489
store bfloat %a.cvt, ptr %out
@@ -417,6 +528,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
417528
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
418529
; GFX-950-NEXT: flat_store_short v[2:3], v0
419530
; GFX-950-NEXT: s_endpgm
531+
;
532+
; GFX1250-LABEL: fptrunc_f64_to_bf16_neg:
533+
; GFX1250: ; %bb.0: ; %entry
534+
; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, -v[0:1]
535+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
536+
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
537+
; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]|
538+
; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, -v[0:1], v[4:5]
539+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
540+
; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
541+
; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
542+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
543+
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
544+
; GFX1250-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
545+
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
546+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
547+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
548+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
549+
; GFX1250-NEXT: s_endpgm
420550
entry:
421551
%a.neg = fneg double %a
422552
%a.cvt = fptrunc double %a.neg to bfloat
@@ -462,6 +592,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
462592
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
463593
; GFX-950-NEXT: flat_store_short v[2:3], v0
464594
; GFX-950-NEXT: s_endpgm
595+
;
596+
; GFX1250-LABEL: fptrunc_f64_to_bf16_abs:
597+
; GFX1250: ; %bb.0: ; %entry
598+
; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
599+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
600+
; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
601+
; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]|
602+
; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
603+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
604+
; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
605+
; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
606+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
607+
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
608+
; GFX1250-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
609+
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
610+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
611+
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
612+
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
613+
; GFX1250-NEXT: s_endpgm
465614
entry:
466615
%a.abs = call double @llvm.fabs.f64(double %a)
467616
%a.cvt = fptrunc double %a.abs to bfloat

llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s

Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,3 +15,48 @@ v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3]
1515

1616
v_lshl_add_u64 v[2:3], v[4:5], v7, 12345
1717
// GFX1250: v_lshl_add_u64 v[2:3], v[4:5], v7, 0x3039 ; encoding: [0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
18+
19+
v_cvt_pk_bf16_f32 v5, v1, v2
20+
// GFX1250: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00]
21+
22+
v_cvt_pk_bf16_f32 v5, v255, v255
23+
// GFX1250: v_cvt_pk_bf16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x6d,0xd7,0xff,0xff,0x03,0x00]
24+
25+
v_cvt_pk_bf16_f32 v5, s1, s2
26+
// GFX1250: v_cvt_pk_bf16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x04,0x00,0x00]
27+
28+
v_cvt_pk_bf16_f32 v5, s105, s105
29+
// GFX1250: v_cvt_pk_bf16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x6d,0xd7,0x69,0xd2,0x00,0x00]
30+
31+
v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15
32+
// GFX1250: v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x6d,0xd7,0x6a,0xf6,0x00,0x00]
33+
34+
v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456
35+
// GFX1250: v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x6d,0xd7,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
36+
37+
v_cvt_pk_bf16_f32 v5, ttmp15, src_scc
38+
// GFX1250: v_cvt_pk_bf16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x6d,0xd7,0x7b,0xfa,0x01,0x00]
39+
40+
v_cvt_pk_bf16_f32 v5, m0, 0.5
41+
// GFX1250: v_cvt_pk_bf16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x6d,0xd7,0x7d,0xe0,0x01,0x00]
42+
43+
v_cvt_pk_bf16_f32 v5, exec_lo, -1
44+
// GFX1250: v_cvt_pk_bf16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x6d,0xd7,0x7e,0x82,0x01,0x00]
45+
46+
v_cvt_pk_bf16_f32 v5, exec_hi, null
47+
// GFX1250: v_cvt_pk_bf16_f32 v5, exec_hi, null ; encoding: [0x05,0x00,0x6d,0xd7,0x7f,0xf8,0x00,0x00]
48+
49+
v_cvt_pk_bf16_f32 v5, null, exec_lo
50+
// GFX1250: v_cvt_pk_bf16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x6d,0xd7,0x7c,0xfc,0x00,0x00]
51+
52+
v_cvt_pk_bf16_f32 v5, -1, exec_hi
53+
// GFX1250: v_cvt_pk_bf16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x6d,0xd7,0xc1,0xfe,0x00,0x00]
54+
55+
v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2
56+
// GFX1250: v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x6d,0xd7,0xf0,0xfa,0x00,0x08]
57+
58+
v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4
59+
// GFX1250: v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4 ; encoding: [0x05,0x00,0x6d,0xd7,0xfd,0xd4,0x00,0x10]
60+
61+
v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2
62+
// GFX1250: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf]

llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s

Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,3 +15,48 @@ v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3]
1515

1616
v_lshl_add_u64 v[2:3], v[4:5], v7, 12345
1717
// GFX1250: v_lshl_add_u64 v[2:3], v[4:5], v7, 0x3039 ; encoding: [0x02,0x00,0x52,0xd6,0x04,0x0f,0xfe,0x03,0x39,0x30,0x00,0x00]
18+
19+
v_cvt_pk_bf16_f32 v5, v1, v2
20+
// GFX1250: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x05,0x02,0x00]
21+
22+
v_cvt_pk_bf16_f32 v5, v255, v255
23+
// GFX1250: v_cvt_pk_bf16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x6d,0xd7,0xff,0xff,0x03,0x00]
24+
25+
v_cvt_pk_bf16_f32 v5, s1, s2
26+
// GFX1250: v_cvt_pk_bf16_f32 v5, s1, s2 ; encoding: [0x05,0x00,0x6d,0xd7,0x01,0x04,0x00,0x00]
27+
28+
v_cvt_pk_bf16_f32 v5, s105, s105
29+
// GFX1250: v_cvt_pk_bf16_f32 v5, s105, s105 ; encoding: [0x05,0x00,0x6d,0xd7,0x69,0xd2,0x00,0x00]
30+
31+
v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15
32+
// GFX1250: v_cvt_pk_bf16_f32 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x6d,0xd7,0x6a,0xf6,0x00,0x00]
33+
34+
v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456
35+
// GFX1250: v_cvt_pk_bf16_f32 v5, vcc_hi, 0xaf123456 ; encoding: [0x05,0x00,0x6d,0xd7,0x6b,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]
36+
37+
v_cvt_pk_bf16_f32 v5, ttmp15, src_scc
38+
// GFX1250: v_cvt_pk_bf16_f32 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x6d,0xd7,0x7b,0xfa,0x01,0x00]
39+
40+
v_cvt_pk_bf16_f32 v5, m0, 0.5
41+
// GFX1250: v_cvt_pk_bf16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x6d,0xd7,0x7d,0xe0,0x01,0x00]
42+
43+
v_cvt_pk_bf16_f32 v5, exec_lo, -1
44+
// GFX1250: v_cvt_pk_bf16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x6d,0xd7,0x7e,0x82,0x01,0x00]
45+
46+
v_cvt_pk_bf16_f32 v5, exec_hi, null
47+
// GFX1250: v_cvt_pk_bf16_f32 v5, exec_hi, null ; encoding: [0x05,0x00,0x6d,0xd7,0x7f,0xf8,0x00,0x00]
48+
49+
v_cvt_pk_bf16_f32 v5, null, exec_lo
50+
// GFX1250: v_cvt_pk_bf16_f32 v5, null, exec_lo ; encoding: [0x05,0x00,0x6d,0xd7,0x7c,0xfc,0x00,0x00]
51+
52+
v_cvt_pk_bf16_f32 v5, -1, exec_hi
53+
// GFX1250: v_cvt_pk_bf16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x6d,0xd7,0xc1,0xfe,0x00,0x00]
54+
55+
v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2
56+
// GFX1250: v_cvt_pk_bf16_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x6d,0xd7,0xf0,0xfa,0x00,0x08]
57+
58+
v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4
59+
// GFX1250: v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4 ; encoding: [0x05,0x00,0x6d,0xd7,0xfd,0xd4,0x00,0x10]
60+
61+
v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2
62+
// GFX1250: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf]

0 commit comments

Comments
 (0)