Skip to content

Commit 8665610

Browse files
committed
Base: correctly rounded floats constructed from rationals
Constructing a floating-point number from a `Rational` should now be correctly rounded. Implementation approach: 1. Convert the (numerator, denominator) pair to a (sign bit, integral significand, exponent) triplet using integer arithmetic. The integer type in question must be wide enough. 2. Convert the above triplet into an instance of the chosen FP type. There is special support for IEEE 754 floating-point and for `BigFloat`, otherwise a fallback using `ldexp` is used. As a bonus, constructing a `BigFloat` from a `Rational` should now be thread-safe when the rounding mode and precision are provided to the constructor, because there is no access to the global precision or rounding mode settings. Updates #45213 Updates #50940 Updates #52507 Fixes #52394 Closes #52395 Fixes #52859
1 parent fc6295d commit 8665610

File tree

8 files changed

+781
-11
lines changed

8 files changed

+781
-11
lines changed

base/Base.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ include("rounding.jl")
253253
include("div.jl")
254254
include("rawbigints.jl")
255255
include("float.jl")
256+
include("rational_to_float.jl")
256257
include("twiceprecision.jl")
257258
include("complex.jl")
258259
include("rational.jl")

base/docs/basedocs.jl

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2777,6 +2777,25 @@ julia> 4/2
27772777
julia> 4.5/2
27782778
2.25
27792779
```
2780+
2781+
This function may convert integer arguments to a floating-point number type
2782+
([`AbstractFloat`](@ref)), potentially resulting in a loss of accuracy. To avoid this,
2783+
instead construct a [`Rational`](@ref) from the arguments, then convert the resulting
2784+
rational number to a specific floating-point type of your choice:
2785+
2786+
```jldoctest
2787+
julia> n = 100000000000000000
2788+
100000000000000000
2789+
2790+
julia> m = n + 6
2791+
100000000000000006
2792+
2793+
julia> n/m
2794+
1.0
2795+
2796+
julia> Float64(n//m) # `//` constructs a `Rational`
2797+
0.9999999999999999
2798+
```
27802799
"""
27812800
/(x, y)
27822801

base/mpfr.jl

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ import
1919
isone, big, _string_n, decompose, minmax,
2020
sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand,
2121
uinttype, exponent_max, exponent_min, ieee754_representation, significand_mask,
22-
RawBigIntRoundingIncrementHelper, truncated, RawBigInt
22+
RawBigIntRoundingIncrementHelper, truncated, RawBigInt, RationalToFloat,
23+
rational_to_floating_point
2324

2425

2526
using .Base.Libc
@@ -310,12 +311,51 @@ BigFloat(x::Union{UInt8,UInt16,UInt32}, r::MPFRRoundingMode=ROUNDING_MODE[]; pre
310311
BigFloat(x::Union{Float16,Float32}, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
311312
BigFloat(Float64(x), r; precision=precision)
312313

313-
function BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[])
314-
setprecision(BigFloat, precision) do
315-
setrounding_raw(BigFloat, r) do
316-
BigFloat(numerator(x))::BigFloat / BigFloat(denominator(x))::BigFloat
314+
function set_2exp!(z::BigFloat, n::BigInt, exp::Int, rm::MPFRRoundingMode)
315+
ccall(
316+
(:mpfr_set_z_2exp, libmpfr),
317+
Int32,
318+
(Ref{BigFloat}, Ref{BigInt}, Int, MPFRRoundingMode),
319+
z, n, exp, rm,
320+
)
321+
nothing
322+
end
323+
324+
function RationalToFloat.to_floating_point_impl(::Type{BigFloat}, ::Type{BigInt}, num, den, romo, prec)
325+
num_is_zero = iszero(num)
326+
den_is_zero = iszero(den)
327+
s = Int8(sign(num))
328+
sb = signbit(s)
329+
is_zero = num_is_zero & !den_is_zero
330+
is_inf = !num_is_zero & den_is_zero
331+
is_regular = !num_is_zero & !den_is_zero
332+
333+
if is_regular
334+
let rtfc = RationalToFloat.to_float_components
335+
c = rtfc(BigInt, num, den, prec, nothing, romo, sb)
336+
ret = BigFloat(precision = prec)
337+
mpfr_romo = convert(MPFRRoundingMode, romo)
338+
set_2exp!(ret, s * c.integral_significand, Int(c.exponent - prec + true), mpfr_romo)
339+
ret
317340
end
318-
end
341+
else
342+
if is_zero
343+
BigFloat(false, MPFRRoundToZero, precision = prec)
344+
elseif is_inf
345+
BigFloat(s * Inf, MPFRRoundToZero, precision = prec)
346+
else
347+
BigFloat(precision = prec)
348+
end
349+
end::BigFloat
350+
end
351+
352+
function BigFloat(x::Rational, r::RoundingMode; precision::Integer = DEFAULT_PRECISION[])
353+
rational_to_floating_point(BigFloat, x, r, precision)
354+
end
355+
356+
function BigFloat(x::Rational, r::MPFRRoundingMode = ROUNDING_MODE[];
357+
precision::Integer = DEFAULT_PRECISION[])
358+
rational_to_floating_point(BigFloat, x, r, precision)
319359
end
320360

321361
function tryparse(::Type{BigFloat}, s::AbstractString; base::Integer=0, precision::Integer=DEFAULT_PRECISION[], rounding::MPFRRoundingMode=ROUNDING_MODE[])

base/rational.jl

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,18 @@ Bool(x::Rational) = x==0 ? false : x==1 ? true :
140140
(::Type{T})(x::Rational) where {T<:Integer} = (isinteger(x) ? convert(T, x.num)::T :
141141
throw(InexactError(nameof(T), T, x)))
142142

143-
AbstractFloat(x::Rational) = (float(x.num)/float(x.den))::AbstractFloat
144-
function (::Type{T})(x::Rational{S}) where T<:AbstractFloat where S
145-
P = promote_type(T,S)
146-
convert(T, convert(P,x.num)/convert(P,x.den))::T
143+
function rational_to_floating_point(::Type{F}, x, rm, prec) where {F}
144+
nd = (numerator(x), denominator(x))
145+
RationalToFloat.to_floating_point(F, nd..., rm, prec)::F
146+
end
147+
148+
function (::Type{F})(x::Rational, rm::RoundingMode = RoundNearest) where {F<:AbstractFloat}
149+
rational_to_floating_point(F, x, rm, precision(F))::F
150+
end
151+
152+
function AbstractFloat(x::Q) where {Q<:Rational}
153+
T = float(Q)
154+
T(x)::T::AbstractFloat
147155
end
148156

149157
function Rational{T}(x::AbstractFloat) where T<:Integer

base/rational_to_float.jl

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
# This file is a part of Julia. License is MIT: https://julialang.org/license
2+
3+
module RationalToFloat
4+
5+
# Performance optimization. Unlike raw `<<` or `>>>`, this is supposed
6+
# to compile to a single instruction, because the semantics correspond
7+
# to what hardware usually provides.
8+
function machine_shift(shift::S, a::T, b) where {S,T<:Base.BitInteger}
9+
@inline begin
10+
mask = 8*sizeof(T) - 1
11+
c = b & mask
12+
shift(a, c)
13+
end
14+
end
15+
16+
machine_shift(::S, a::Bool, ::Any) where {S} = error("unsupported")
17+
18+
# Fallback for `BigInt` etc.
19+
machine_shift(shift::S, a, b) where {S} = shift(a, b)
20+
21+
# Arguments are positive integers.
22+
function div_significand_with_remainder(num, den, minimum_significand_size)
23+
clamped = x -> max(zero(x), x)::typeof(x)
24+
bw = Base.top_set_bit # bit width
25+
shift = clamped(minimum_significand_size + bw(den) - bw(num) + 0x2)
26+
t = machine_shift(<<, num, shift)
27+
(divrem(t, den, RoundToZero)..., shift)
28+
end
29+
30+
# `divrem(n, 1<<k, RoundToZero)`
31+
function divrem_2(n, k)
32+
quo = machine_shift(>>>, n, k)
33+
tmp = machine_shift(<<, quo, k)
34+
rem = n - tmp
35+
(quo, rem)
36+
end
37+
38+
function to_float_components_impl(num, den, precision, max_subnormal_exp)
39+
# `+1` because we need an extra, "round", bit for some rounding modes.
40+
#
41+
# TODO: as a performance optimization, only do this when required
42+
# by the rounding mode
43+
prec_p_1 = precision + true
44+
45+
(quo0, rem0, shift) = div_significand_with_remainder(num, den, prec_p_1)
46+
width = Base.top_set_bit(quo0)
47+
excess_width = width - prec_p_1
48+
exp = width - shift - true
49+
50+
exp_underflow = if isnothing(max_subnormal_exp)
51+
zero(exp)
52+
else
53+
let d = max_subnormal_exp - exp, T = typeof(d), z = zero(d)::T
54+
(signbit(d) ? z : d + true)::T
55+
end
56+
end
57+
58+
(quo1, rem1) = divrem_2(quo0, exp_underflow + excess_width)
59+
integral_significand = quo1 >>> true
60+
round_bit = quo1 % Bool
61+
sticky_bit = !iszero(rem1) | !iszero(rem0)
62+
63+
(; integral_significand, exponent = exp, round_bit, sticky_bit)
64+
end
65+
66+
struct RoundingIncrementHelper
67+
final_bit::Bool
68+
round_bit::Bool
69+
sticky_bit::Bool
70+
end
71+
72+
(h::RoundingIncrementHelper)(::Base.Rounding.FinalBit) = h.final_bit
73+
(h::RoundingIncrementHelper)(::Base.Rounding.RoundBit) = h.round_bit
74+
(h::RoundingIncrementHelper)(::Base.Rounding.StickyBit) = h.sticky_bit
75+
76+
function to_float_components_rounded(num, den, precision, max_subnormal_exp, romo, sign_bit)
77+
overflows = (x, p) -> x == machine_shift(<<, one(x), p)
78+
t = to_float_components_impl(num, den, precision, max_subnormal_exp)
79+
raw_significand = t.integral_significand
80+
rh = RoundingIncrementHelper(raw_significand % Bool, t.round_bit, t.sticky_bit)
81+
incr = Base.Rounding.correct_rounding_requires_increment(rh, romo, sign_bit)
82+
rounded = raw_significand + incr
83+
(integral_significand, exponent) = let exp = t.exponent
84+
if overflows(rounded, precision)
85+
(rounded >>> true, exp + true)
86+
else
87+
(rounded, exp)
88+
end
89+
end
90+
(; integral_significand, exponent)
91+
end
92+
93+
function to_float_components(::Type{T}, num, den, precision, max_subnormal_exp, romo, sb) where {T}
94+
to_float_components_rounded(abs(T(num)), den, precision, max_subnormal_exp, romo, sb)
95+
end
96+
97+
function to_floating_point_fallback(::Type{T}, ::Type{S}, num, den, rm, prec) where {T,S}
98+
num_is_zero = iszero(num)
99+
den_is_zero = iszero(den)
100+
sb = signbit(num)
101+
is_zero = num_is_zero & !den_is_zero
102+
is_inf = !num_is_zero & den_is_zero
103+
is_regular = !num_is_zero & !den_is_zero
104+
if is_regular
105+
let
106+
c = to_float_components(S, num, den, prec, nothing, rm, sb)
107+
signif = T(c.integral_significand)::T
108+
x = ldexp(signif, c.exponent - prec + true)::T
109+
sb ? -x : x
110+
end::T
111+
else
112+
if is_zero
113+
zero(T)::T
114+
elseif is_inf
115+
T(Inf)::T
116+
else
117+
T(NaN)::T
118+
end
119+
end::T
120+
end
121+
122+
function to_floating_point_impl(::Type{T}, ::Type{S}, num, den, rm, prec) where {T,S}
123+
to_floating_point_fallback(T, S, num, den, rm, prec)
124+
end
125+
126+
function to_floating_point_impl(::Type{T}, ::Type{S}, num, den, rm, prec) where {T<:Base.IEEEFloat,S}
127+
num_is_zero = iszero(num)
128+
den_is_zero = iszero(den)
129+
sb = signbit(num)
130+
is_zero = num_is_zero & !den_is_zero
131+
is_inf = !num_is_zero & den_is_zero
132+
is_regular = !num_is_zero & !den_is_zero
133+
Rnd = Base.Rounding
134+
(rm_is_to_zero, rm_is_from_zero) = if Rnd.rounds_to_nearest(rm)
135+
(false, false)
136+
else
137+
let from = Rnd.rounds_away_from_zero(rm, sb)
138+
(!from, from)
139+
end
140+
end::NTuple{2,Bool}
141+
exp_max = Base.exponent_max(T)
142+
exp_min = Base.exponent_min(T)
143+
ieee_repr = Base.ieee754_representation
144+
repr_zero = ieee_repr(T, sb, Val(:zero))
145+
repr_inf = ieee_repr(T, sb, Val(:inf))
146+
repr_nan = ieee_repr(T, sb, Val(:nan))
147+
U = typeof(repr_zero)
148+
repr_zero::U
149+
repr_inf::U
150+
repr_nan::U
151+
152+
ret_u = if is_regular
153+
let
154+
c = to_float_components(S, num, den, prec, exp_min - 1, rm, sb)
155+
exp = c.exponent
156+
exp_diff = exp - exp_min
157+
is_normal = 0 exp_diff
158+
exp_is_huge_p = exp_max < exp
159+
exp_is_huge_n = signbit(exp_diff + prec)
160+
rounds_to_inf = exp_is_huge_p & !rm_is_to_zero
161+
rounds_to_zero = exp_is_huge_n & !rm_is_from_zero
162+
163+
if !rounds_to_zero & !exp_is_huge_p
164+
let signif = (c.integral_significand % U) & Base.significand_mask(T)
165+
exp_field = (max(exp_diff, zero(exp_diff)) + is_normal) % U
166+
ieee_repr(T, sb, exp_field, signif)::U
167+
end
168+
elseif rounds_to_zero
169+
repr_zero
170+
elseif rounds_to_inf
171+
repr_inf
172+
else
173+
ieee_repr(T, sb, Val(:omega))
174+
end
175+
end
176+
else
177+
if is_zero
178+
repr_zero
179+
elseif is_inf
180+
repr_inf
181+
else
182+
repr_nan
183+
end
184+
end::U
185+
186+
reinterpret(T, ret_u)::T
187+
end
188+
189+
# `BigInt` is a safe default.
190+
to_float_promote_type(::Type{F}, ::Type{S}) where {F,S} = BigInt
191+
192+
const BitIntegerOrBool = Union{Bool,Base.BitInteger}
193+
194+
# As an optimization, use an integer type narrower than `BigInt` when possible.
195+
function to_float_promote_type(::Type{F}, ::Type{S}) where {F<:Base.IEEEFloat,S<:BitIntegerOrBool}
196+
Max = if sizeof(F) sizeof(S)
197+
S
198+
else
199+
(S <: Signed) ? Base.inttype(F) : Base.uinttype(F)
200+
end
201+
widen(Max)
202+
end
203+
204+
function to_floating_point(::Type{F}, num::T, den::T, rm, prec) where {F,T}
205+
S = to_float_promote_type(F, T)
206+
to_floating_point_impl(F, S, num, den, rm, prec)
207+
end
208+
209+
end

test/choosetests.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ const TESTNAMES = [
1111
"char", "strings", "triplequote", "unicode", "intrinsics",
1212
"dict", "hashing", "iobuffer", "staged", "offsetarray",
1313
"arrayops", "tuple", "reduce", "reducedim", "abstractarray",
14-
"intfuncs", "simdloop", "vecelement", "rational",
14+
"intfuncs", "simdloop", "vecelement", "rational", "rational_to_float",
1515
"bitarray", "copy", "math", "fastmath", "functional", "iterators",
1616
"operators", "ordering", "path", "ccall", "parse", "loading", "gmp",
1717
"sorting", "spawn", "backtrace", "exceptions",

0 commit comments

Comments
 (0)