Skip to content

Commit 9bdaf1d

Browse files
committed
AMDGPU: Make v2f64 -> v2f16 conversion Legal only for unsafe fast math
On targets that support v_cvt_pk_f16_f32 instruction, if we make v2f64 -> v2f16 Legal, we will generate the following sequence of instructons: v_cvt_f32_f64_e32 v1, s[6:7] v_cvt_f32_f64_e32 v2, s[4:5] v_cvt_pk_f16_f32 v1, v2, v1 While it is okay when unsafe fast math flag is set, it may return imprecise results due to double rounding. This patch makes v2f64 -> v2f16 conversion Legal only when unsafe fast math is set. Fixes: SWDEV-523856
1 parent ad9f15a commit 9bdaf1d

File tree

3 files changed

+205
-5
lines changed

3 files changed

+205
-5
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,10 +1053,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
10531053
}
10541054

10551055
auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1056-
if (ST.hasCvtPkF16F32Inst())
1057-
FPTruncActions.legalFor(
1058-
{{S32, S64}, {S16, S32}, {V2S16, V2S32}, {V2S16, V2S64}});
1059-
else
1056+
if (ST.hasCvtPkF16F32Inst()) {
1057+
FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}});
1058+
if (TM.Options.UnsafeFPMath)
1059+
FPTruncActions.legalFor({V2S16, V2S64});
1060+
} else
10601061
FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
10611062
FPTruncActions.scalarize(0).lower();
10621063

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -915,7 +915,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
915915
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
916916
}
917917

918-
if (Subtarget->hasCvtPkF16F32Inst()) {
918+
if (Subtarget->hasCvtPkF16F32Inst() && TM.Options.UnsafeFPMath) {
919919
setOperationAction(ISD::FP_ROUND, MVT::v2f16, Legal);
920920
}
921921

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -check-prefixes=GFX950,GFX950-SAFE-SDAG %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -check-prefixes=GFX950,GFX950-SAFE-GISEL %s
4+
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX950,GFX950-UNSAFE %s
5+
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX950,GFX950-UNSAFE %s
6+
7+
define amdgpu_ps <2 x half> @v_test_cvt_v2f32_v2f16(<2 x float> %src) {
8+
; GFX950-LABEL: v_test_cvt_v2f32_v2f16:
9+
; GFX950: ; %bb.0:
10+
; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
11+
; GFX950-NEXT: ; return to shader part epilog
12+
%res = fptrunc <2 x float> %src to <2 x half>
13+
ret <2 x half> %res
14+
}
15+
16+
define amdgpu_ps <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) {
17+
; GFX950-SAFE-SDAG-LABEL: v_test_cvt_v2f64_v2f16:
18+
; GFX950-SAFE-SDAG: ; %bb.0:
19+
; GFX950-SAFE-SDAG-NEXT: s_movk_i32 s0, 0x1ff
20+
; GFX950-SAFE-SDAG-NEXT: v_and_or_b32 v0, v1, s0, v0
21+
; GFX950-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
22+
; GFX950-SAFE-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v1
23+
; GFX950-SAFE-SDAG-NEXT: s_movk_i32 s1, 0xffe
24+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
25+
; GFX950-SAFE-SDAG-NEXT: v_bfe_u32 v5, v1, 20, 11
26+
; GFX950-SAFE-SDAG-NEXT: v_and_or_b32 v0, v4, s1, v0
27+
; GFX950-SAFE-SDAG-NEXT: v_sub_u32_e32 v6, 0x3f1, v5
28+
; GFX950-SAFE-SDAG-NEXT: v_or_b32_e32 v4, 0x1000, v0
29+
; GFX950-SAFE-SDAG-NEXT: v_med3_i32 v6, v6, 0, 13
30+
; GFX950-SAFE-SDAG-NEXT: v_lshrrev_b32_e32 v7, v6, v4
31+
; GFX950-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v6, v6, v7
32+
; GFX950-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4
33+
; GFX950-SAFE-SDAG-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5
34+
; GFX950-SAFE-SDAG-NEXT: v_lshl_or_b32 v6, v5, 12, v0
35+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
36+
; GFX950-SAFE-SDAG-NEXT: v_or_b32_e32 v4, v7, v4
37+
; GFX950-SAFE-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
38+
; GFX950-SAFE-SDAG-NEXT: s_movk_i32 s2, 0x40f
39+
; GFX950-SAFE-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1
40+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
41+
; GFX950-SAFE-SDAG-NEXT: v_and_b32_e32 v6, 7, v4
42+
; GFX950-SAFE-SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6
43+
; GFX950-SAFE-SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v4
44+
; GFX950-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x8000
45+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
46+
; GFX950-SAFE-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6
47+
; GFX950-SAFE-SDAG-NEXT: s_nop 1
48+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
49+
; GFX950-SAFE-SDAG-NEXT: v_or_b32_e32 v6, v6, v7
50+
; GFX950-SAFE-SDAG-NEXT: v_add_u32_e32 v4, v4, v6
51+
; GFX950-SAFE-SDAG-NEXT: v_mov_b32_e32 v6, 0x7c00
52+
; GFX950-SAFE-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
53+
; GFX950-SAFE-SDAG-NEXT: v_mov_b32_e32 v7, 0x7e00
54+
; GFX950-SAFE-SDAG-NEXT: s_nop 0
55+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
56+
; GFX950-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
57+
; GFX950-SAFE-SDAG-NEXT: s_nop 1
58+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc
59+
; GFX950-SAFE-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v5
60+
; GFX950-SAFE-SDAG-NEXT: s_nop 1
61+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
62+
; GFX950-SAFE-SDAG-NEXT: v_and_or_b32 v0, v1, s3, v0
63+
; GFX950-SAFE-SDAG-NEXT: v_and_or_b32 v1, v3, s0, v2
64+
; GFX950-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
65+
; GFX950-SAFE-SDAG-NEXT: v_lshrrev_b32_e32 v2, 8, v3
66+
; GFX950-SAFE-SDAG-NEXT: v_bfe_u32 v4, v3, 20, 11
67+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
68+
; GFX950-SAFE-SDAG-NEXT: v_and_or_b32 v1, v2, s1, v1
69+
; GFX950-SAFE-SDAG-NEXT: v_sub_u32_e32 v5, 0x3f1, v4
70+
; GFX950-SAFE-SDAG-NEXT: v_or_b32_e32 v2, 0x1000, v1
71+
; GFX950-SAFE-SDAG-NEXT: v_med3_i32 v5, v5, 0, 13
72+
; GFX950-SAFE-SDAG-NEXT: v_lshrrev_b32_e32 v8, v5, v2
73+
; GFX950-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v5, v5, v8
74+
; GFX950-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2
75+
; GFX950-SAFE-SDAG-NEXT: v_add_u32_e32 v4, 0xfffffc10, v4
76+
; GFX950-SAFE-SDAG-NEXT: v_lshl_or_b32 v5, v4, 12, v1
77+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
78+
; GFX950-SAFE-SDAG-NEXT: v_or_b32_e32 v2, v8, v2
79+
; GFX950-SAFE-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4
80+
; GFX950-SAFE-SDAG-NEXT: s_mov_b32 s0, 0x5040100
81+
; GFX950-SAFE-SDAG-NEXT: s_nop 0
82+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
83+
; GFX950-SAFE-SDAG-NEXT: v_and_b32_e32 v5, 7, v2
84+
; GFX950-SAFE-SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5
85+
; GFX950-SAFE-SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v2
86+
; GFX950-SAFE-SDAG-NEXT: s_nop 0
87+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
88+
; GFX950-SAFE-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5
89+
; GFX950-SAFE-SDAG-NEXT: s_nop 1
90+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
91+
; GFX950-SAFE-SDAG-NEXT: v_or_b32_e32 v5, v5, v8
92+
; GFX950-SAFE-SDAG-NEXT: v_add_u32_e32 v2, v2, v5
93+
; GFX950-SAFE-SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4
94+
; GFX950-SAFE-SDAG-NEXT: s_nop 1
95+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
96+
; GFX950-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
97+
; GFX950-SAFE-SDAG-NEXT: s_nop 1
98+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
99+
; GFX950-SAFE-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v4
100+
; GFX950-SAFE-SDAG-NEXT: s_nop 1
101+
; GFX950-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
102+
; GFX950-SAFE-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v3
103+
; GFX950-SAFE-SDAG-NEXT: v_and_or_b32 v1, v2, s3, v1
104+
; GFX950-SAFE-SDAG-NEXT: v_perm_b32 v0, v1, v0, s0
105+
; GFX950-SAFE-SDAG-NEXT: ; return to shader part epilog
106+
;
107+
; GFX950-SAFE-GISEL-LABEL: v_test_cvt_v2f64_v2f16:
108+
; GFX950-SAFE-GISEL: ; %bb.0:
109+
; GFX950-SAFE-GISEL-NEXT: v_mov_b32_e32 v7, 0x1ff
110+
; GFX950-SAFE-GISEL-NEXT: v_and_or_b32 v0, v1, v7, v0
111+
; GFX950-SAFE-GISEL-NEXT: v_bfe_u32 v4, v1, 20, 11
112+
; GFX950-SAFE-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
113+
; GFX950-SAFE-GISEL-NEXT: v_add_u32_e32 v4, 0xfffffc10, v4
114+
; GFX950-SAFE-GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
115+
; GFX950-SAFE-GISEL-NEXT: v_mov_b32_e32 v6, 0xffe
116+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
117+
; GFX950-SAFE-GISEL-NEXT: v_and_or_b32 v0, v5, v6, v0
118+
; GFX950-SAFE-GISEL-NEXT: v_sub_u32_e32 v10, 1, v4
119+
; GFX950-SAFE-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
120+
; GFX950-SAFE-GISEL-NEXT: v_lshl_or_b32 v9, v4, 12, v0
121+
; GFX950-SAFE-GISEL-NEXT: v_med3_i32 v10, v10, 0, 13
122+
; GFX950-SAFE-GISEL-NEXT: v_or_b32_e32 v0, 0x1000, v0
123+
; GFX950-SAFE-GISEL-NEXT: v_lshrrev_b32_e32 v11, v10, v0
124+
; GFX950-SAFE-GISEL-NEXT: v_lshlrev_b32_e32 v10, v10, v11
125+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
126+
; GFX950-SAFE-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, v10, v0
127+
; GFX950-SAFE-GISEL-NEXT: v_mov_b32_e32 v8, 0x7c00
128+
; GFX950-SAFE-GISEL-NEXT: v_lshl_or_b32 v5, v5, 9, v8
129+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
130+
; GFX950-SAFE-GISEL-NEXT: v_or_b32_e32 v0, v11, v0
131+
; GFX950-SAFE-GISEL-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4
132+
; GFX950-SAFE-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
133+
; GFX950-SAFE-GISEL-NEXT: v_and_or_b32 v2, v3, v7, v2
134+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
135+
; GFX950-SAFE-GISEL-NEXT: v_and_b32_e32 v9, 7, v0
136+
; GFX950-SAFE-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9
137+
; GFX950-SAFE-GISEL-NEXT: v_cmp_lt_i32_e64 s[0:1], 5, v9
138+
; GFX950-SAFE-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
139+
; GFX950-SAFE-GISEL-NEXT: v_lshrrev_b32_e32 v0, 2, v0
140+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1]
141+
; GFX950-SAFE-GISEL-NEXT: v_add_u32_e32 v0, v0, v9
142+
; GFX950-SAFE-GISEL-NEXT: v_cmp_lt_i32_e32 vcc, 30, v4
143+
; GFX950-SAFE-GISEL-NEXT: v_mov_b32_e32 v9, 0x40f
144+
; GFX950-SAFE-GISEL-NEXT: s_nop 0
145+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
146+
; GFX950-SAFE-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9
147+
; GFX950-SAFE-GISEL-NEXT: v_mov_b32_e32 v4, 0x8000
148+
; GFX950-SAFE-GISEL-NEXT: s_nop 0
149+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
150+
; GFX950-SAFE-GISEL-NEXT: v_and_or_b32 v0, v1, v4, v0
151+
; GFX950-SAFE-GISEL-NEXT: v_bfe_u32 v1, v3, 20, 11
152+
; GFX950-SAFE-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
153+
; GFX950-SAFE-GISEL-NEXT: v_add_u32_e32 v1, 0xfffffc10, v1
154+
; GFX950-SAFE-GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v3
155+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
156+
; GFX950-SAFE-GISEL-NEXT: v_and_or_b32 v2, v5, v6, v2
157+
; GFX950-SAFE-GISEL-NEXT: v_sub_u32_e32 v7, 1, v1
158+
; GFX950-SAFE-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
159+
; GFX950-SAFE-GISEL-NEXT: v_lshl_or_b32 v6, v1, 12, v2
160+
; GFX950-SAFE-GISEL-NEXT: v_med3_i32 v7, v7, 0, 13
161+
; GFX950-SAFE-GISEL-NEXT: v_or_b32_e32 v2, 0x1000, v2
162+
; GFX950-SAFE-GISEL-NEXT: v_lshrrev_b32_e32 v10, v7, v2
163+
; GFX950-SAFE-GISEL-NEXT: v_lshlrev_b32_e32 v7, v7, v10
164+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
165+
; GFX950-SAFE-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, v7, v2
166+
; GFX950-SAFE-GISEL-NEXT: v_lshl_or_b32 v5, v5, 9, v8
167+
; GFX950-SAFE-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
168+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
169+
; GFX950-SAFE-GISEL-NEXT: v_or_b32_e32 v2, v10, v2
170+
; GFX950-SAFE-GISEL-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
171+
; GFX950-SAFE-GISEL-NEXT: s_nop 1
172+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
173+
; GFX950-SAFE-GISEL-NEXT: v_and_b32_e32 v6, 7, v2
174+
; GFX950-SAFE-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6
175+
; GFX950-SAFE-GISEL-NEXT: v_cmp_lt_i32_e64 s[0:1], 5, v6
176+
; GFX950-SAFE-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
177+
; GFX950-SAFE-GISEL-NEXT: v_lshrrev_b32_e32 v2, 2, v2
178+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
179+
; GFX950-SAFE-GISEL-NEXT: v_add_u32_e32 v2, v2, v6
180+
; GFX950-SAFE-GISEL-NEXT: v_cmp_lt_i32_e32 vcc, 30, v1
181+
; GFX950-SAFE-GISEL-NEXT: s_nop 1
182+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
183+
; GFX950-SAFE-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9
184+
; GFX950-SAFE-GISEL-NEXT: s_nop 1
185+
; GFX950-SAFE-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
186+
; GFX950-SAFE-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3
187+
; GFX950-SAFE-GISEL-NEXT: v_and_or_b32 v1, v2, v4, v1
188+
; GFX950-SAFE-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
189+
; GFX950-SAFE-GISEL-NEXT: ; return to shader part epilog
190+
;
191+
; GFX950-UNSAFE-LABEL: v_test_cvt_v2f64_v2f16:
192+
; GFX950-UNSAFE: ; %bb.0:
193+
; GFX950-UNSAFE-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
194+
; GFX950-UNSAFE-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
195+
; GFX950-UNSAFE-NEXT: v_cvt_pk_f16_f32 v0, v0, v2
196+
; GFX950-UNSAFE-NEXT: ; return to shader part epilog
197+
%res = fptrunc <2 x double> %src to <2 x half>
198+
ret <2 x half> %res
199+
}

0 commit comments

Comments
 (0)