Skip to content

Commit 7f88c70

Browse files
RalphAS authored and simonbyrne committed
more thorough conversions (#21)
1 parent 99dc160 commit 7f88c70

File tree

2 files changed

+163
-9
lines changed

2 files changed

+163
-9
lines changed

src/Quadmath.jl

Lines changed: 148 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -122,31 +122,89 @@ fpinttype(::Type{Float128}) = UInt128
122122
# conversion
123123
# Identity conversion: a value that is already a Float128 is returned as-is.
Float128(x::Float128) = x
124124

125-
## Float64
125+
# Float64
126126
# Convert a Float64 to Float128 by calling `__extenddftf2` from `quadoplib` and
# wrapping the returned Cfloat128.
function Float128(x::Float64)
    widened = @ccall(quadoplib.__extenddftf2(x::Cdouble)::Cfloat128)
    return Float128(widened)
end

# Convert a Float128 to Float64 by calling `__trunctfdf2` from `quadoplib`.
function Float64(x::Float128)
    return @ccall(quadoplib.__trunctfdf2(x::Cfloat128)::Cdouble)
end
130130

131-
# Int32
131+
# Float32
132+
# Convert a Float32 to Float128 by calling `__extendsftf2` from `quadoplib` and
# wrapping the returned Cfloat128.
function Float128(x::Float32)
    widened = @ccall(quadoplib.__extendsftf2(x::Cfloat)::Cfloat128)
    return Float128(widened)
end

# Convert a Float128 to Float32 by calling `__trunctfsf2` from `quadoplib`.
function Float32(x::Float128)
    return @ccall(quadoplib.__trunctfsf2(x::Cfloat128)::Cfloat)
end
136+
137+
# integer -> Float128
132138
# Convert an Int32 to Float128 by calling `__floatsitf` from `quadoplib`.
function Float128(x::Int32)
    converted = @ccall(quadoplib.__floatsitf(x::Int32)::Cfloat128)
    return Float128(converted)
end
134-
Int32(x::Float128) =
135-
@ccall(quadoplib.__fixtfsi(x::Cfloat128)::Int32)
136140

137-
# UInt32
138141
# Convert a UInt32 to Float128 by calling `__floatunsitf` from `quadoplib`.
function Float128(x::UInt32)
    converted = @ccall(quadoplib.__floatunsitf(x::UInt32)::Cfloat128)
    return Float128(converted)
end
140143

141-
# Int64
142144
# Convert an Int64 to Float128 by calling `__floatditf` from `quadoplib`.
function Float128(x::Int64)
    converted = @ccall(quadoplib.__floatditf(x::Int64)::Cfloat128)
    return Float128(converted)
end
144-
Int64(x::Float128) =
145-
@ccall(quadoplib.__fixtfdi(x::Cfloat128)::Int64)
146+
147+
# Convert a UInt64 to Float128 by calling `__floatunditf` from `quadoplib`.
function Float128(x::UInt64)
    converted = @ccall(quadoplib.__floatunditf(x::UInt64)::Cfloat128)
    return Float128(converted)
end
149+
150+
# 8- and 16-bit integers are first widened to the 32-bit integer type of the same
# signedness, then converted through the corresponding 32-bit method.
for (Narrow, Wide) in ((Int16, Int32), (Int8, Int32), (UInt16, UInt32), (UInt8, UInt32))
    @eval Float128(x::$Narrow) = Float128($Wide(x))
end
154+
155+
# Convert a UInt128 to Float128 by assembling the bit pattern directly.
# Values with more than 113 significant bits are rounded to nearest, ties to even.
function Float128(x::UInt128)
    x == 0 && return Float128(0.0)
    n = 128 - leading_zeros(x)  # number of significant bits, ndigits0z(x,2)
    if n <= 113
        # Exactly representable: move the leading one to bit 112, then mask it off
        # so only the stored significand bits remain.
        y = (x << (113 - n)) & significand_mask(Float128)
    else
        # Keep the 113 bits below the leading one: the stored significand plus one
        # extra bit used for rounding.
        y = (x >> (n - 114)) & 0x0001_ffff_ffff_ffff_ffff_ffff_ffff_ffff
        y = (y + 1) >> 1  # round, ties up (a carry into bit 112 bumps the exponent)
        # On an exact tie, clear the last bit to round to even. The mask has to be
        # 128 bits wide: a UInt64 mask is zero-extended when combined with `y` and
        # would erase the upper 64 bits of the significand and any rounding carry.
        y &= ~UInt128(trailing_zeros(x) == (n - 114))
    end
    d = ((n + 16382) % UInt128) << 112  # exponent field
    d += y
    if Sys.iswindows()
        return reinterpret(Float128, d)
    else
        # Elsewhere the value is built from its two 64-bit halves, passed as
        # (low, high) to the tuple constructor.
        hi = reinterpret(Float64, UInt64(d >> 64))
        lo = reinterpret(Float64, d % UInt64)
        return Float128((lo, hi))
    end
end
176+
177+
# Convert an Int128 to Float128 by assembling the bit pattern directly.
# Magnitudes with more than 113 significant bits are rounded to nearest, ties to even.
function Float128(x::Int128)
    # Return a Float128 zero (not the Float64 literal 0.0) so the constructor
    # always yields a Float128.
    x == 0 && return Float128(0.0)
    s = reinterpret(UInt128, x) & sign_mask(Float128)  # sign bit
    mag = abs(x) % UInt128  # magnitude; typemin(Int128) wraps to 2^127
    n = 128 - leading_zeros(mag)  # number of significant bits, ndigits0z(mag,2)
    if n <= 113
        # Exactly representable: move the leading one to bit 112, then mask it off
        # so only the stored significand bits remain.
        y = (mag << (113 - n)) & significand_mask(Float128)
    else
        # Keep the 113 bits below the leading one: the stored significand plus one
        # extra bit used for rounding.
        y = (mag >> (n - 114)) & 0x0001_ffff_ffff_ffff_ffff_ffff_ffff_ffff
        y = (y + 1) >> 1  # round, ties up (a carry into bit 112 bumps the exponent)
        # On an exact tie, clear the last bit to round to even. The mask has to be
        # 128 bits wide: a UInt64 mask is zero-extended when combined with `y` and
        # would erase the upper 64 bits of the significand and any rounding carry.
        y &= ~UInt128(trailing_zeros(mag) == (n - 114))
    end
    d = ((n + 16382) % UInt128) << 112  # exponent field
    # Exponent plus significand never reaches bit 127, so OR-ing in the sign is safe.
    d = s | (d + y)
    if Sys.iswindows()
        return reinterpret(Float128, d)
    else
        # Elsewhere the value is built from its two 64-bit halves, passed as
        # (low, high) to the tuple constructor.
        hi = reinterpret(Float64, UInt64(d >> 64))
        lo = reinterpret(Float64, d % UInt64)
        return Float128((lo, hi))
    end
end
200+
201+
# Float128 -> integer requires arithmetic, so is below
146202

147203
# Rational
148204
# Convert a Rational by converting numerator and denominator separately and dividing
# the two Float128 values.
function Float128(x::Rational{T}) where T
    num = Float128(numerator(x))
    den = Float128(denominator(x))
    return num / den
end
149205

206+
# `true` converts to one and `false` to zero.
function Float128(x::Bool)
    if x
        return Float128(1)
    else
        return Float128(0)
    end
end
207+
150208
# Comparison
151209
# Equality holds when `__eqtf2` from `quadoplib` returns zero.
function (==)(x::Float128, y::Float128)
    status = @ccall(quadoplib.__eqtf2(x::Cfloat128, y::Cfloat128)::Cint)
    return status == 0
end
@@ -168,6 +226,85 @@ Float128(x::Rational{T}) where T = Float128(numerator(x))/Float128(denominator(x
168226
# Unary minus, computed by `__negtf2` from `quadoplib`.
function (-)(x::Float128)
    negated = @ccall(quadoplib.__negtf2(x::Cfloat128)::Cfloat128)
    return Float128(negated)
end
170228

229+
# Float128 -> Integer
230+
# Unchecked Float128 -> integer conversions for the 32- and 64-bit integer types.
# Each method forwards to the matching `__fix*tf*` routine in `quadoplib`; no range
# check is performed here.
function unsafe_trunc(::Type{Int32}, x::Float128)
    return @ccall(quadoplib.__fixtfsi(x::Cfloat128)::Int32)
end

function unsafe_trunc(::Type{Int64}, x::Float128)
    return @ccall(quadoplib.__fixtfdi(x::Cfloat128)::Int64)
end

function unsafe_trunc(::Type{UInt32}, x::Float128)
    return @ccall(quadoplib.__fixunstfsi(x::Cfloat128)::UInt32)
end

function unsafe_trunc(::Type{UInt64}, x::Float128)
    return @ccall(quadoplib.__fixunstfdi(x::Cfloat128)::UInt64)
end
241+
242+
# Unchecked Float128 -> UInt128 truncation done on the raw bit pattern.
# The sign bit is ignored and no range check is performed.
function unsafe_trunc(::Type{UInt128}, x::Float128)
    bits = reinterpret(UInt128, x)
    # 15-bit exponent field, rebased so that `shift` is the power of two by which
    # the integer significand assembled below has to be scaled.
    shift = (Int64(bits >> 112) & 0x07fff) - 16382 - 113
    # Stored significand bits with the implicit leading one restored at bit 112.
    implicit_bit = 0x0001_0000_0000_0000_0000_0000_0000_0000
    mantissa = (bits & significand_mask(Float128)) | implicit_bit
    return shift <= 0 ? UInt128(mantissa >> -shift) : UInt128(mantissa) << shift
end
252+
# Unchecked Float128 -> Int128 truncation: take the magnitude from the UInt128
# method, reinterpret it modulo 2^128 as Int128, then apply the sign of `x`.
function unsafe_trunc(::Type{Int128}, x::Float128)
    magnitude = unsafe_trunc(UInt128, x) % Int128
    return copysign(magnitude, x)
end
255+
# Truncation to an abstract integer type uses the default concrete type of that
# signedness: `Int` for `Signed` and `Integer`, `UInt` for `Unsigned`.
# (`Unsigned` previously forwarded to `Int`, which returned a signed value and
# rejected inputs above `typemax(Int)`.)
trunc(::Type{Signed}, x::Float128) = trunc(Int, x)
trunc(::Type{Unsigned}, x::Float128) = trunc(UInt, x)
trunc(::Type{Integer}, x::Float128) = trunc(Int, x)
258+
259+
# Generate the checked conversions Float128 -> integer for each supported integer
# type: `trunc(Ti, x)` drops the fractional part, and the constructor `Ti(x)`
# additionally requires `x` to be integer-valued. Both throw `InexactError` when
# `x` is outside the accepted range; the actual conversion is done by `unsafe_trunc`.
for IntT in (Int32, Int64, Int128, UInt32, UInt64, UInt128)
    lo = Float128(typemin(IntT))
    hi = Float128(typemax(IntT))
    ctorname = Expr(:quote, IntT.name.name)
    if IntT <: Unsigned || sizeof(IntT) < sizeof(Float128)
        # Bounds for `trunc` are widened by one on each side and compared strictly,
        # so anything that truncates into [typemin, typemax] is accepted.
        # NOTE(review): for UInt128, `Float128(typemax(UInt128))` cannot be exact
        # (128 significant bits), so the inclusive upper bound of the constructor
        # may admit 2^128 -- confirm.
        @eval begin
            function trunc(::Type{$IntT}, x::Float128)
                $(lo - one(Float128)) < x < $(hi + one(Float128)) ||
                    throw(InexactError(:trunc, $IntT, x))
                return unsafe_trunc($IntT, x)
            end
            function (::Type{$IntT})(x::Float128)
                (($lo <= x <= $hi) && (round(x, RoundToZero) == x)) ||
                    throw(InexactError($ctorname, $IntT, x))
                return unsafe_trunc($IntT, x)
            end
        end
    else
        # Same-width signed type (Int128): `Float128(typemin(IntT))` is the only
        # value that truncates to `typemin`, and `Float128(typemax(IntT))` is
        # rounded up, so the lower bound is inclusive and the upper bound exclusive.
        @eval begin
            function trunc(::Type{$IntT}, x::Float128)
                $lo <= x < $hi || throw(InexactError(:trunc, $IntT, x))
                return unsafe_trunc($IntT, x)
            end
            function (::Type{$IntT})(x::Float128)
                (($lo <= x < $hi) && (round(x, RoundToZero) == x)) ||
                    throw(InexactError($ctorname, $IntT, x))
                return unsafe_trunc($IntT, x)
            end
        end
    end
end
305+
306+
## math
307+
171308
## one argument
172309
for f in (:acos, :acosh, :asin, :asinh, :atan, :atanh, :cosh, :cos,
173310
:exp, :expm1, :log, :log2, :log10, :log1p,
@@ -180,6 +317,9 @@ end
180317

181318
# Absolute value, delegated to `fabsq` from libquadmath.
function abs(x::Float128)
    magnitude = @ccall(libquadmath.fabsq(x::Cfloat128)::Cfloat128)
    return Float128(magnitude)
end

# Round to an integral value, delegated to `rintq` from libquadmath.
function round(x::Float128)
    rounded = @ccall(libquadmath.rintq(x::Cfloat128)::Cfloat128)
    return Float128(rounded)
end
320+
# Directed rounding to an integral value.
round(x::Float128, r::RoundingMode{:Down}) = floor(x)
round(x::Float128, r::RoundingMode{:Up}) = ceil(x)
# Rounding toward zero is truncation: `ceil` for negative values, `floor` otherwise
# (NaN and signed zeros pass through `floor` unchanged). Delegating to `round(x)`
# would not truncate, since that method rounds via `rintq` rather than toward zero.
round(x::Float128, r::RoundingMode{:ToZero}) = x < Float128(0) ? ceil(x) : floor(x)
183323

184324
## two argument
185325
(^)(x::Float128, y::Float128) =

test/runtests.jl

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ using Quadmath
1010
@test z == y
1111
end
1212

13-
@testset "conversion $T" for T in (Float64, Int32, Int64, BigFloat, BigInt)
13+
@testset "conversion $T" for T in (Float32, Float64, Int32, Int64, Int128, UInt32, UInt64, UInt128, BigFloat, BigInt)
1414
@test Float128(T(1)) + Float128(T(2)) == Float128(T(3))
1515
@test Float128(T(1)) + Float128(T(2)) <= Float128(T(3))
1616
@test Float128(T(1)) + Float128(T(2)) != Float128(T(4))
@@ -22,6 +22,20 @@ end
2222
end
2323
end
2424

25+
# Converting a Float128 just outside the range of an integer type must throw.
@testset "conversion $T exceptions" for T in (Int32, Int64, UInt32, UInt64)
    above = Float128(typemax(T)) + Float128(1)
    @test_throws InexactError T(above)
    below = Float128(typemin(T)) - Float128(1)
    @test_throws InexactError T(below)
end
31+
32+
# Narrowing conversions to the hardware float types must saturate to infinity.
@testset "conversion $T exceptions" for T in (Float32, Float64)
    # `typemax`/`typemin` of a float type are already infinite, so these two
    # checks only cover infinities passing through unchanged.
    x = Float128(typemax(T))
    @test isinf(T(x+Float128(1)))
    x = Float128(typemin(T))
    @test isinf(T(x-Float128(1)))
    # Finite overflow: twice the largest finite T is representable as a Float128
    # but not as a T, so the narrowing conversion has to produce +/-Inf.
    fmax = Float128(floatmax(T))
    @test T(fmax + fmax) == typemax(T)
    @test T(-fmax - fmax) == typemin(T)
end
38+
2539
# `Base.exponent_one(Float128)` must equal the raw bit pattern of Float128(1.0).
@test Base.exponent_one(Float128) == reinterpret(UInt128, Float128(1.0))
2640

2741
@testset "BigFloat" begin

0 commit comments

Comments
 (0)