Skip to content

Commit a9c8e94

Browse files
authored
[DAGCombiner] Extend FP-to-Int cast without requiring nsz (#161093)
This patch updates the FP-to-Int conversion handling:

- For signed integers: use `ftrunc` followed by clamping to the target integer range.
- For unsigned integers: apply `fabs` + `ftrunc`, then clamp.

This removes the previous dependence on `nsz` and ensures correct lowering for both signed and unsigned cases.

I've tested the code generation with `-mtriple=amdgcn`. The generated assembly looks as expected, but I'm not sure how to write a general test case that covers every target.

Fixes #160623.
1 parent 93d3260 commit a9c8e94

File tree

3 files changed

+330
-11
lines changed

3 files changed

+330
-11
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18870,27 +18870,38 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
1887018870

1887118871
static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
1887218872
const TargetLowering &TLI) {
18873-
// We only do this if the target has legal ftrunc. Otherwise, we'd likely be
18874-
// replacing casts with a libcall. We also must be allowed to ignore -0.0
18875-
// because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
18876-
// conversions would return +0.0.
18873+
// We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
18874+
// If NoSignedZerosFPMath is enabled, this is a direct replacement.
18875+
// Otherwise, for strict math, we must handle edge cases:
18876+
// 1. For unsigned conversions, apply FABS first to handle negative inputs.
18877+
// For example, -0.0 converts to integer 0 and back to +0.0, whereas FTRUNC
18878+
// on its own would produce -0.0.
18879+
1887718880
// FIXME: We should be able to use node-level FMF here.
18878-
// TODO: If strict math, should we use FABS (+ range check for signed cast)?
1887918881
EVT VT = N->getValueType(0);
18880-
if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
18881-
!DAG.getTarget().Options.NoSignedZerosFPMath)
18882+
if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
1888218883
return SDValue();
1888318884

1888418885
// fptosi/fptoui round towards zero, so converting from FP to integer and
1888518886
// back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
1888618887
SDValue N0 = N->getOperand(0);
1888718888
if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
18888-
N0.getOperand(0).getValueType() == VT)
18889-
return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
18889+
N0.getOperand(0).getValueType() == VT) {
18890+
if (DAG.getTarget().Options.NoSignedZerosFPMath)
18891+
return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
18892+
}
1889018893

1889118894
if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
18892-
N0.getOperand(0).getValueType() == VT)
18893-
return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
18895+
N0.getOperand(0).getValueType() == VT) {
18896+
if (DAG.getTarget().Options.NoSignedZerosFPMath)
18897+
return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
18898+
18899+
// Strict math: use FABS to handle negative inputs correctly.
18900+
if (TLI.isFAbsFree(VT)) {
18901+
SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
18902+
return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
18903+
}
18904+
}
1889418905

1889518906
return SDValue();
1889618907
}

llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,12 @@ define half @t3(half %x) {
7070
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
7171
; CHECK-NEXT: ret
7272
;
73+
; USE-NEON-NO-GPRS-LABEL: t3:
74+
; USE-NEON-NO-GPRS: // %bb.0: // %entry
75+
; USE-NEON-NO-GPRS-NEXT: fcvtzs h0, h0
76+
; USE-NEON-NO-GPRS-NEXT: scvtf h0, h0
77+
; USE-NEON-NO-GPRS-NEXT: ret
78+
;
7379
; NONEON-NOSVE-LABEL: t3:
7480
; NONEON-NOSVE: // %bb.0: // %entry
7581
; NONEON-NOSVE-NEXT: fcvt s0, h0
@@ -147,6 +153,12 @@ define half @t6(half %x) {
147153
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
148154
; CHECK-NEXT: ret
149155
;
156+
; USE-NEON-NO-GPRS-LABEL: t6:
157+
; USE-NEON-NO-GPRS: // %bb.0: // %entry
158+
; USE-NEON-NO-GPRS-NEXT: fcvtzu h0, h0
159+
; USE-NEON-NO-GPRS-NEXT: ucvtf h0, h0
160+
; USE-NEON-NO-GPRS-NEXT: ret
161+
;
150162
; NONEON-NOSVE-LABEL: t6:
151163
; NONEON-NOSVE: // %bb.0: // %entry
152164
; NONEON-NOSVE-NEXT: fcvt s0, h0
Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
4+
5+
define amdgpu_kernel void @fptoui_f32_to_i16_to_f32(ptr addrspace(1) %out, float %x) {
6+
; GFX6-LABEL: fptoui_f32_to_i16_to_f32:
7+
; GFX6: ; %bb.0: ; %entry
8+
; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
9+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
10+
; GFX6-NEXT: s_mov_b32 s3, 0xf000
11+
; GFX6-NEXT: s_mov_b32 s2, -1
12+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
13+
; GFX6-NEXT: v_trunc_f32_e64 v0, |s6|
14+
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
15+
; GFX6-NEXT: s_endpgm
16+
;
17+
; GFX9-LABEL: fptoui_f32_to_i16_to_f32:
18+
; GFX9: ; %bb.0: ; %entry
19+
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
20+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
21+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
22+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
23+
; GFX9-NEXT: v_trunc_f32_e64 v1, |s2|
24+
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
25+
; GFX9-NEXT: s_endpgm
26+
entry:
27+
%ui = fptoui float %x to i16
28+
%fp = uitofp i16 %ui to float
29+
store float %fp, ptr addrspace(1) %out
30+
ret void
31+
}
32+
33+
define amdgpu_kernel void @fptoui_f32_to_i32_to_f32(ptr addrspace(1) %out, float %x) {
34+
; GFX6-LABEL: fptoui_f32_to_i32_to_f32:
35+
; GFX6: ; %bb.0: ; %entry
36+
; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
37+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
38+
; GFX6-NEXT: s_mov_b32 s3, 0xf000
39+
; GFX6-NEXT: s_mov_b32 s2, -1
40+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
41+
; GFX6-NEXT: v_trunc_f32_e64 v0, |s6|
42+
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
43+
; GFX6-NEXT: s_endpgm
44+
;
45+
; GFX9-LABEL: fptoui_f32_to_i32_to_f32:
46+
; GFX9: ; %bb.0: ; %entry
47+
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
48+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
49+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
50+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
51+
; GFX9-NEXT: v_trunc_f32_e64 v1, |s2|
52+
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
53+
; GFX9-NEXT: s_endpgm
54+
entry:
55+
%ui = fptoui float %x to i32
56+
%fp = uitofp i32 %ui to float
57+
store float %fp, ptr addrspace(1) %out
58+
ret void
59+
}
60+
61+
define amdgpu_kernel void @fptoui_f32_to_i64_to_f32(ptr addrspace(1) %out, float %x) {
62+
; GFX6-LABEL: fptoui_f32_to_i64_to_f32:
63+
; GFX6: ; %bb.0: ; %entry
64+
; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
65+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
66+
; GFX6-NEXT: s_mov_b32 s3, 0xf000
67+
; GFX6-NEXT: s_mov_b32 s2, -1
68+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
69+
; GFX6-NEXT: v_trunc_f32_e64 v0, |s6|
70+
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
71+
; GFX6-NEXT: s_endpgm
72+
;
73+
; GFX9-LABEL: fptoui_f32_to_i64_to_f32:
74+
; GFX9: ; %bb.0: ; %entry
75+
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
76+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
77+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
78+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
79+
; GFX9-NEXT: v_trunc_f32_e64 v1, |s2|
80+
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
81+
; GFX9-NEXT: s_endpgm
82+
entry:
83+
%ui = fptoui float %x to i64
84+
%fp = uitofp i64 %ui to float
85+
store float %fp, ptr addrspace(1) %out
86+
ret void
87+
}
88+
89+
define amdgpu_kernel void @fptoui_f16_to_i16_to_f16(ptr addrspace(1) %out, half %x) {
90+
; GFX6-LABEL: fptoui_f16_to_i16_to_f16:
91+
; GFX6: ; %bb.0: ; %entry
92+
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
93+
; GFX6-NEXT: s_mov_b32 s3, 0xf000
94+
; GFX6-NEXT: s_mov_b32 s2, -1
95+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
96+
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0
97+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
98+
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
99+
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
100+
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
101+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
102+
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
103+
; GFX6-NEXT: s_endpgm
104+
;
105+
; GFX9-LABEL: fptoui_f16_to_i16_to_f16:
106+
; GFX9: ; %bb.0: ; %entry
107+
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
108+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
109+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
110+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
111+
; GFX9-NEXT: v_trunc_f16_e64 v1, |s2|
112+
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
113+
; GFX9-NEXT: s_endpgm
114+
entry:
115+
%ui = fptoui half %x to i16
116+
%fp = uitofp i16 %ui to half
117+
store half %fp, ptr addrspace(1) %out
118+
ret void
119+
}
120+
121+
define amdgpu_kernel void @fptoui_f16_to_i32_to_f16(ptr addrspace(1) %out, half %x) {
122+
; GFX6-LABEL: fptoui_f16_to_i32_to_f16:
123+
; GFX6: ; %bb.0: ; %entry
124+
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
125+
; GFX6-NEXT: s_mov_b32 s3, 0xf000
126+
; GFX6-NEXT: s_mov_b32 s2, -1
127+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
128+
; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0|
129+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
130+
; GFX6-NEXT: v_trunc_f32_e32 v0, v0
131+
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
132+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
133+
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
134+
; GFX6-NEXT: s_endpgm
135+
;
136+
; GFX9-LABEL: fptoui_f16_to_i32_to_f16:
137+
; GFX9: ; %bb.0: ; %entry
138+
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
139+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
140+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
141+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
142+
; GFX9-NEXT: v_trunc_f16_e64 v1, |s2|
143+
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
144+
; GFX9-NEXT: s_endpgm
145+
entry:
146+
%ui = fptoui half %x to i32
147+
%fp = uitofp i32 %ui to half
148+
store half %fp, ptr addrspace(1) %out
149+
ret void
150+
}
151+
152+
define amdgpu_kernel void @fptoui_f16_to_i64_to_f16(ptr addrspace(1) %out, half %x) {
153+
; GFX6-LABEL: fptoui_f16_to_i64_to_f16:
154+
; GFX6: ; %bb.0: ; %entry
155+
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
156+
; GFX6-NEXT: s_mov_b32 s3, 0xf000
157+
; GFX6-NEXT: s_mov_b32 s2, -1
158+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
159+
; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0|
160+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
161+
; GFX6-NEXT: v_trunc_f32_e32 v0, v0
162+
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
163+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
164+
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
165+
; GFX6-NEXT: s_endpgm
166+
;
167+
; GFX9-LABEL: fptoui_f16_to_i64_to_f16:
168+
; GFX9: ; %bb.0: ; %entry
169+
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
170+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
171+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
172+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
173+
; GFX9-NEXT: v_trunc_f16_e64 v1, |s2|
174+
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
175+
; GFX9-NEXT: s_endpgm
176+
entry:
177+
%ui = fptoui half %x to i64
178+
%fp = uitofp i64 %ui to half
179+
store half %fp, ptr addrspace(1) %out
180+
ret void
181+
}
182+
183+
define amdgpu_kernel void @fptoui_f64_to_i16_to_f64(ptr addrspace(1) %out, double %x) {
184+
; GFX6-LABEL: fptoui_f64_to_i16_to_f64:
185+
; GFX6: ; %bb.0: ; %entry
186+
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
187+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
188+
; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3]
189+
; GFX6-NEXT: s_mov_b32 s3, 0xf000
190+
; GFX6-NEXT: s_mov_b32 s2, -1
191+
; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
192+
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
193+
; GFX6-NEXT: s_endpgm
194+
;
195+
; GFX9-LABEL: fptoui_f64_to_i16_to_f64:
196+
; GFX9: ; %bb.0: ; %entry
197+
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
198+
; GFX9-NEXT: v_mov_b32_e32 v2, 0
199+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
200+
; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]|
201+
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
202+
; GFX9-NEXT: s_endpgm
203+
entry:
204+
%ui = fptoui double %x to i16
205+
%fp = uitofp i16 %ui to double
206+
store double %fp, ptr addrspace(1) %out
207+
ret void
208+
}
209+
210+
define amdgpu_kernel void @fptoui_f64_to_i32_to_f64(ptr addrspace(1) %out, double %x) {
211+
; GFX6-LABEL: fptoui_f64_to_i32_to_f64:
212+
; GFX6: ; %bb.0: ; %entry
213+
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
214+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
215+
; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3]
216+
; GFX6-NEXT: s_mov_b32 s3, 0xf000
217+
; GFX6-NEXT: s_mov_b32 s2, -1
218+
; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
219+
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
220+
; GFX6-NEXT: s_endpgm
221+
;
222+
; GFX9-LABEL: fptoui_f64_to_i32_to_f64:
223+
; GFX9: ; %bb.0: ; %entry
224+
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
225+
; GFX9-NEXT: v_mov_b32_e32 v2, 0
226+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
227+
; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]|
228+
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
229+
; GFX9-NEXT: s_endpgm
230+
entry:
231+
%ui = fptoui double %x to i32
232+
%fp = uitofp i32 %ui to double
233+
store double %fp, ptr addrspace(1) %out
234+
ret void
235+
}
236+
237+
define amdgpu_kernel void @fptoui_f64_to_i64_to_f64(ptr addrspace(1) %out, double %x) {
238+
; GFX6-LABEL: fptoui_f64_to_i64_to_f64:
239+
; GFX6: ; %bb.0: ; %entry
240+
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
241+
; GFX6-NEXT: s_mov_b32 s6, -1
242+
; GFX6-NEXT: s_mov_b32 s5, 0xfffff
243+
; GFX6-NEXT: s_mov_b32 s4, s6
244+
; GFX6-NEXT: v_not_b32_e32 v0, 31
245+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
246+
; GFX6-NEXT: s_bfe_u32 s7, s3, 0xb0014
247+
; GFX6-NEXT: s_addk_i32 s7, 0xfc01
248+
; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
249+
; GFX6-NEXT: s_and_b32 s8, s3, 0x80000000
250+
; GFX6-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5]
251+
; GFX6-NEXT: s_cmp_lt_i32 s7, 0
252+
; GFX6-NEXT: s_cselect_b32 s4, 0, s4
253+
; GFX6-NEXT: s_cselect_b32 s5, s8, s5
254+
; GFX6-NEXT: s_cmp_gt_i32 s7, 51
255+
; GFX6-NEXT: s_cselect_b32 s3, s3, s5
256+
; GFX6-NEXT: s_cselect_b32 s2, s2, s4
257+
; GFX6-NEXT: v_ldexp_f64 v[0:1], s[2:3], v0
258+
; GFX6-NEXT: v_mov_b32_e32 v4, -1
259+
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
260+
; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
261+
; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
262+
; GFX6-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 3
263+
; GFX6-NEXT: s_mov_b32 s4, 0
264+
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
265+
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
266+
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
267+
; GFX6-NEXT: v_mov_b32_e32 v2, s2
268+
; GFX6-NEXT: s_mov_b32 s5, 0xc1f00000
269+
; GFX6-NEXT: v_mov_b32_e32 v3, s3
270+
; GFX6-NEXT: v_fma_f64 v[2:3], v[0:1], s[4:5], v[2:3]
271+
; GFX6-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
272+
; GFX6-NEXT: v_cvt_u32_f64_e32 v2, v[2:3]
273+
; GFX6-NEXT: s_mov_b32 s7, 0xf000
274+
; GFX6-NEXT: s_mov_b32 s4, s0
275+
; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
276+
; GFX6-NEXT: v_cvt_f64_u32_e32 v[2:3], v2
277+
; GFX6-NEXT: s_mov_b32 s5, s1
278+
; GFX6-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
279+
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
280+
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
281+
; GFX6-NEXT: s_endpgm
282+
;
283+
; GFX9-LABEL: fptoui_f64_to_i64_to_f64:
284+
; GFX9: ; %bb.0: ; %entry
285+
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
286+
; GFX9-NEXT: v_mov_b32_e32 v2, 0
287+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
288+
; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]|
289+
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
290+
; GFX9-NEXT: s_endpgm
291+
entry:
292+
%ui = fptoui double %x to i64
293+
%fp = uitofp i64 %ui to double
294+
store double %fp, ptr addrspace(1) %out
295+
ret void
296+
}

0 commit comments

Comments
 (0)