11; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
3+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
34
45define amdgpu_cs float @test_cvt_f32_bf8_byte0 (i32 %a ) {
56; GFX12-LABEL: test_cvt_f32_bf8_byte0:
67; GFX12: ; %bb.0:
78; GFX12-NEXT: v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
89; GFX12-NEXT: ; return to shader part epilog
10+ ;
11+ ; GFX1250-LABEL: test_cvt_f32_bf8_byte0:
12+ ; GFX1250: ; %bb.0:
13+ ; GFX1250-NEXT: v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
14+ ; GFX1250-NEXT: ; return to shader part epilog
915 %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32 (i32 %a , i32 228 , i32 15 , i32 15 , i1 1 )
1016 %ret = tail call float @llvm.amdgcn.cvt.f32.bf8 (i32 %tmp0 , i32 0 )
1117 ret float %ret
@@ -16,6 +22,11 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
1622; GFX12: ; %bb.0:
1723; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
1824; GFX12-NEXT: ; return to shader part epilog
25+ ;
26+ ; GFX1250-LABEL: test_cvt_f32_bf8_byte1:
27+ ; GFX1250: ; %bb.0:
28+ ; GFX1250-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
29+ ; GFX1250-NEXT: ; return to shader part epilog
1930 %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32 (i32 %a , i32 228 , i32 15 , i32 15 , i1 1 )
2031 %ret = tail call float @llvm.amdgcn.cvt.f32.bf8 (i32 %tmp0 , i32 1 )
2132 ret float %ret
@@ -26,6 +37,11 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
2637; GFX12: ; %bb.0:
2738; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
2839; GFX12-NEXT: ; return to shader part epilog
40+ ;
41+ ; GFX1250-LABEL: test_cvt_f32_bf8_byte2:
42+ ; GFX1250: ; %bb.0:
43+ ; GFX1250-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
44+ ; GFX1250-NEXT: ; return to shader part epilog
2945 %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32 (i32 %a , i32 228 , i32 15 , i32 15 , i1 1 )
3046 %ret = tail call float @llvm.amdgcn.cvt.f32.bf8 (i32 %tmp0 , i32 2 )
3147 ret float %ret
@@ -36,6 +52,11 @@ define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) {
3652; GFX12: ; %bb.0:
3753; GFX12-NEXT: v_cvt_f32_fp8_e64_dpp v0, v0 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
3854; GFX12-NEXT: ; return to shader part epilog
55+ ;
56+ ; GFX1250-LABEL: test_cvt_f32_fp8_byte3:
57+ ; GFX1250: ; %bb.0:
58+ ; GFX1250-NEXT: v_cvt_f32_fp8_e64_dpp v0, v0 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
59+ ; GFX1250-NEXT: ; return to shader part epilog
3960 %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32 (i32 %a , i32 228 , i32 15 , i32 15 , i1 1 )
4061 %ret = tail call float @llvm.amdgcn.cvt.f32.fp8 (i32 %tmp0 , i32 3 )
4162 ret float %ret
@@ -47,6 +68,13 @@ define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr
4768; GFX12-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
4869; GFX12-NEXT: global_store_b32 v[3:4], v2, off
4970; GFX12-NEXT: s_endpgm
71+ ;
72+ ; GFX1250-LABEL: test_cvt_pk_bf8_f32_word0:
73+ ; GFX1250: ; %bb.0:
74+ ; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
75+ ; GFX1250-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
76+ ; GFX1250-NEXT: global_store_b32 v[4:5], v2, off
77+ ; GFX1250-NEXT: s_endpgm
5078 %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32 (i32 %a , i32 228 , i32 15 , i32 15 , i1 1 )
5179 %tmp1 = bitcast i32 %tmp0 to float
5280 %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32 (float %tmp1 , float %y , i32 %old , i1 false )
@@ -62,6 +90,15 @@ define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr
6290; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
6391; GFX12-NEXT: global_store_b32 v[3:4], v2, off
6492; GFX12-NEXT: s_endpgm
93+ ;
94+ ; GFX1250-LABEL: test_cvt_pk_fp8_f32_word1:
95+ ; GFX1250: ; %bb.0:
96+ ; GFX1250-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
97+ ; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
98+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
99+ ; GFX1250-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
100+ ; GFX1250-NEXT: global_store_b32 v[4:5], v2, off
101+ ; GFX1250-NEXT: s_endpgm
65102 %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32 (i32 %a , i32 228 , i32 15 , i32 15 , i1 1 )
66103 %tmp1 = bitcast i32 %tmp0 to float
67104 %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32 (float %tmp1 , float %y , i32 %old , i1 true )
@@ -75,6 +112,13 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a
75112; GFX12-NEXT: v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
76113; GFX12-NEXT: global_store_b32 v[3:4], v2, off
77114; GFX12-NEXT: s_endpgm
115+ ;
116+ ; GFX1250-LABEL: test_cvt_sr_bf8_f32_byte0:
117+ ; GFX1250: ; %bb.0:
118+ ; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
119+ ; GFX1250-NEXT: v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
120+ ; GFX1250-NEXT: global_store_b32 v[4:5], v2, off
121+ ; GFX1250-NEXT: s_endpgm
78122 %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32 (i32 %a , i32 228 , i32 15 , i32 15 , i1 1 )
79123 %tmp1 = bitcast i32 %tmp0 to float
80124 %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32 (float %tmp1 , i32 %r , i32 %old , i32 0 )
@@ -88,6 +132,13 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr a
88132; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
89133; GFX12-NEXT: global_store_b32 v[3:4], v2, off
90134; GFX12-NEXT: s_endpgm
135+ ;
136+ ; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte1:
137+ ; GFX1250: ; %bb.0:
138+ ; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
139+ ; GFX1250-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
140+ ; GFX1250-NEXT: global_store_b32 v[4:5], v2, off
141+ ; GFX1250-NEXT: s_endpgm
91142 %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32 (i32 %a , i32 228 , i32 15 , i32 15 , i1 1 )
92143 %tmp1 = bitcast i32 %tmp0 to float
93144 %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32 (float %tmp1 , i32 %r , i32 %old , i32 1 )
@@ -101,6 +152,13 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr a
101152; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
102153; GFX12-NEXT: global_store_b32 v[3:4], v2, off
103154; GFX12-NEXT: s_endpgm
155+ ;
156+ ; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte2:
157+ ; GFX1250: ; %bb.0:
158+ ; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
159+ ; GFX1250-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
160+ ; GFX1250-NEXT: global_store_b32 v[4:5], v2, off
161+ ; GFX1250-NEXT: s_endpgm
104162 %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32 (i32 %a , i32 228 , i32 15 , i32 15 , i1 1 )
105163 %tmp1 = bitcast i32 %tmp0 to float
106164 %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32 (float %tmp1 , i32 %r , i32 %old , i32 2 )
0 commit comments