Skip to content

Commit 28d0c8e

Browse files
committed
Base: make constructing a float from a rational exact
Two example bugs that are now fixed: ```julia-repl julia> Float16(9//70000) # incorrect because `Float16(70000)` overflows Float16(0.0) julia> Float16(big(9//70000)) # correct because of promotion to `BigFloat` Float16(0.0001286) julia> Float32(16777216//16777217) < 1 # `16777217` doesn't fit in a `Float32` mantissa false ``` Another way to fix this would have been to convert the numerator and denominator into `BigFloat` exactly and then divide one with the other. However, the requirement for exactness means that the `BigFloat` precision would need to be manipulated, something that seems to be problematic in Julia. So implement the division using integer arithmetic. As a bonus, constructing a `BigFloat` from a `Rational` is now thread-safe when the rounding mode and precision are provided to the constructor. Updates #45213
1 parent c245179 commit 28d0c8e

File tree

4 files changed

+438
-10
lines changed

4 files changed

+438
-10
lines changed

base/float.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,10 @@ for T in (Float16, Float32, Float64)
113113
@eval exponent_raw_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)))
114114
end
115115

116+
# The minimum exponent for a normal number.
117+
min_normal_number_exponent(::Type{<:AbstractFloat}) = nothing
118+
min_normal_number_exponent(::Type{F}) where {F<:IEEEFloat} = true - exponent_bias(F)
119+
116120
"""
117121
exponent_max(T)
118122

base/mpfr.jl

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ import
1717
cbrt, typemax, typemin, unsafe_trunc, floatmin, floatmax, rounding,
1818
setrounding, maxintfloat, widen, significand, frexp, tryparse, iszero,
1919
isone, big, _string_n, decompose, minmax,
20-
sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand
20+
sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand,
21+
rational_to_float_components, rational_bool_to_float, bit_width,
22+
rounding_mode_translated_for_abs
2123

2224
import ..Rounding: rounding_raw, setrounding_raw
2325

@@ -280,12 +282,63 @@ BigFloat(x::Union{UInt8,UInt16,UInt32}, r::MPFRRoundingMode=ROUNDING_MODE[]; pre
280282
BigFloat(x::Union{Float16,Float32}, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
281283
BigFloat(Float64(x), r; precision=precision)
282284

283-
function BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[])
284-
setprecision(BigFloat, precision) do
285-
setrounding_raw(BigFloat, r) do
286-
BigFloat(numerator(x))::BigFloat / BigFloat(denominator(x))::BigFloat
287-
end
285+
function set_2exp!(z::BigFloat, n::BigInt, exp::Int, rm::MPFRRoundingMode)
286+
ccall(
287+
(:mpfr_set_z_2exp, libmpfr),
288+
Int32,
289+
(Ref{BigFloat}, Ref{BigInt}, Int, MPFRRoundingMode),
290+
z, n, exp, rm,
291+
)
292+
nothing
293+
end
294+
295+
function rational_to_bigfloat(x::Rational, romo::MPFRRoundingMode, prec::Integer)
296+
s = Int8(sign(numerator(x)))
297+
a = abs(x)
298+
299+
num = numerator(a)
300+
den = denominator(a)
301+
302+
# Handle special cases
303+
num_is_zero = iszero(num)
304+
den_is_zero = iszero(den)
305+
if den_is_zero
306+
num_is_zero && return BigFloat(precision = prec) # 0/0
307+
# n/0 = Inf * sign(n)
308+
#
309+
# The rounding mode doesn't matter for infinity.
310+
return BigFloat(s * Inf, MPFRRoundToZero, precision = prec)
311+
end
312+
# 0/1 = 0
313+
#
314+
# The rounding mode doesn't matter for zero.
315+
num_is_zero && return BigFloat(false, MPFRRoundToZero, precision = prec)
316+
317+
rm = rounding_mode_translated_for_abs(convert(RoundingMode, romo), s)
318+
c = rational_to_float_components(num, den, prec, BigInt, rm)
319+
bw = bit_width(c.mantissa)
320+
ret = BigFloat(precision = prec)
321+
322+
# The rounding mode doesn't matter because there shouldn't be any
323+
# rounding, as MPFR doesn't have subnormals.
324+
set_2exp!(ret, s * c.mantissa, Int(c.exponent - bw + true), MPFRRoundToZero)
325+
326+
ret
327+
end
328+
329+
function rational_to_bigfloat(x::Rational{Bool}, ::MPFRRoundingMode, prec::Integer)
330+
# the rounding mode doesn't matter for conversion from booleans
331+
conv = let p = prec
332+
y -> BigFloat(y, MPFRRoundToZero, precision = p)
288333
end
334+
rational_bool_to_float(conv, x)::BigFloat
335+
end
336+
337+
function BigFloat(x::Rational,
338+
r::MPFRRoundingMode = ROUNDING_MODE[];
339+
precision::Integer = DEFAULT_PRECISION[])
340+
y = Rational(promote(numerator(x), denominator(x))...)
341+
rational_to_bigfloat(y, r, precision)::BigFloat
289342
end
290343

291344
function tryparse(::Type{BigFloat}, s::AbstractString; base::Integer=0, precision::Integer=DEFAULT_PRECISION[], rounding::MPFRRoundingMode=ROUNDING_MODE[])

base/rational.jl

Lines changed: 212 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,12 +129,220 @@ Bool(x::Rational) = x==0 ? false : x==1 ? true :
129129
(::Type{T})(x::Rational) where {T<:Integer} = (isinteger(x) ? convert(T, x.num)::T :
130130
throw(InexactError(nameof(T), T, x)))
131131

132-
AbstractFloat(x::Rational) = (float(x.num)/float(x.den))::AbstractFloat
133-
function (::Type{T})(x::Rational{S}) where T<:AbstractFloat where S
134-
P = promote_type(T,S)
135-
convert(T, convert(P,x.num)/convert(P,x.den))::T
132+
bit_width(n) = ndigits(n, base = UInt8(2), pad = false)
133+
134+
function divrem_pow2(num::Integer, n::Integer)
135+
quot = num >> n
136+
rema = num - (quot << n)
137+
(quot, rema)
138+
end
139+
140+
const RoundingModesIndependentFromSign = Union{
141+
RoundingMode{:ToZero}, RoundingMode{:FromZero},
142+
RoundingMode{:Nearest}, RoundingMode{:NearestTiesAway}}
143+
144+
rounding_mode_translated_for_abs(rm::RoundingModesIndependentFromSign, ::Real) = rm
145+
146+
function rounding_mode_translated_for_abs(::RoundingMode{:Down}, sign::Real)
147+
!signbit(sign) ? RoundToZero : RoundFromZero
148+
end
149+
150+
function rounding_mode_translated_for_abs(::RoundingMode{:Up}, sign::Real)
151+
!signbit(sign) ? RoundFromZero : RoundToZero
152+
end
153+
154+
clamped_to_zero(x) = max(zero(x), x)
155+
156+
struct FloatComponentsResult{S<:Integer,T<:Integer}
157+
mantissa::S
158+
exponent::T
159+
inexact::Bool
160+
underflowed::Bool
161+
162+
function FloatComponentsResult(m::S, e::T;
163+
inexact::Bool,
164+
underflowed::Bool) where {S<:Integer,T<:Integer}
165+
new{S,T}(m, e, inexact, underflowed)
166+
end
167+
end
168+
169+
# `num`, `den` are positive integers. `requested_precision` is the
170+
# requested floating-point precision. `T` is the integer type that
171+
# we'll be working with mostly, it needs to be wide enough.
172+
# `exp_lower_bound` is the minimum allowed normalized exponent for
173+
# normal numbers.
174+
function rational_to_float_components_impl(num::Integer,
175+
den::Integer,
176+
requested_precision::Integer,
177+
::Type{T},
178+
romo::RoundingMode,
179+
exp_lower_bound::Union{Nothing,Integer}) where
180+
{T<:Integer}
181+
num_bit_width = bit_width(num)
182+
den_bit_width = bit_width(den)
183+
184+
# `T` must be wide enough to avoid overflow.
185+
numT = T(num)
186+
denT = T(den)
187+
188+
# Creates a mantissa in `quo_` that's at least
189+
# `requested_precision` bits wide.
190+
bit_shift_num_ = clamped_to_zero(den_bit_width - num_bit_width + requested_precision)
191+
quo_ = div(numT << bit_shift_num_, den, RoundToZero)
192+
193+
# Nonnegative. Experiments indicate that, when iterating over all
194+
# possible numerators and denominators below some bit width, with
195+
# some fixed value for `requested_precision`, the maximal attained
196+
# value will be
197+
# `max(1, max(num_bit_width, den_bit_width) - requested_precision)`.
198+
# So in the worst case we have `requested_precision == 1` and
199+
# `excess_precision == max(1, max(num_bit_width, den_bit_width) - 1)`
200+
excess_precision = bit_width(quo_) - requested_precision
201+
202+
# Normalized exponent
203+
nexp = bit_width(quo_) - bit_shift_num_ - true
204+
205+
# Take possible underflow into account: if there's underflow, the
206+
# precision needs to be less than requested.
207+
adjusted_precision = requested_precision
208+
if !isnothing(exp_lower_bound)
209+
underflow = clamped_to_zero(exp_lower_bound - nexp)
210+
adjusted_precision -= underflow
211+
end
212+
adjusted_excess_precision = bit_width(quo_) - adjusted_precision
213+
214+
(adjusted_precision < false) && return FloatComponentsResult(
215+
zero(quo_), zero(nexp), inexact = true, underflowed = true)
216+
217+
# Creates a mantissa in `quo` that's exactly `adjusted_precision`
218+
# bits wide, except if rounding up happens, in which case the bit
219+
# width may be `adjusted_precision + 1`.
220+
bit_shift_num = clamped_to_zero(bit_shift_num_ - adjusted_excess_precision)
221+
bit_shift_den = adjusted_excess_precision - (bit_shift_num_ - bit_shift_num) # nonnegative
222+
(quo, rem) = divrem(numT << bit_shift_num, denT << bit_shift_den, romo)
223+
224+
result_is_exact = iszero(rem)
225+
226+
nexp_final = nexp + (adjusted_precision < bit_width(quo))
227+
is_subnormal = !isnothing(exp_lower_bound) && (nexp_final < exp_lower_bound)
228+
229+
iszero(quo) || (quo >>= trailing_zeros(quo))
230+
231+
# The bit width of `quo` is now less than or equal to
232+
# `adjusted_precision`, except if `adjusted_precision` is zero, in
233+
# which case `bit_width(quo)` may be one, in case rounding up
234+
# happened.
235+
236+
FloatComponentsResult(
237+
quo, nexp_final, inexact = !result_is_exact, underflowed = is_subnormal)
238+
end
239+
240+
# `num`, `den` are positive integers. `bit_width` is the requested
241+
# floating-point precision and must be positive. `T` is the integer
242+
# type that we'll be working with mostly, it needs to be wide enough.
243+
# `exp_lower_bound` is the minimum allowed normalized exponent for
244+
# normal numbers.
245+
function rational_to_float_components(num::Integer,
246+
den::Integer,
247+
bit_width::Integer,
248+
::Type{T},
249+
romo::RoundingMode,
250+
exp_lower_bound::Union{Nothing,Integer} =
251+
nothing) where {T<:Integer}
252+
h(::Nothing, ::Integer) = nothing
253+
h(a::Integer, b::Integer) = a - b
254+
255+
# Factor out powers of two
256+
tz_num = trailing_zeros(num)
257+
tz_den = trailing_zeros(den)
258+
dexp = tz_num - tz_den
259+
c = rational_to_float_components_impl(
260+
num >> tz_num, den >> tz_den, bit_width, T, romo, h(exp_lower_bound, dexp))
261+
FloatComponentsResult(
262+
c.mantissa, c.exponent + dexp, inexact = c.inexact, underflowed = c.underflowed)
263+
end
264+
265+
# Assuming the wanted rounding mode is round to nearest with ties to
266+
# even.
267+
#
268+
# `precision` must be positive.
269+
function rational_to_float_impl(to_float::C,
270+
::Type{T},
271+
x::Rational,
272+
precision::Integer) where {C<:Union{Type,Function},T<:Integer}
273+
s = Int8(sign(numerator(x)))
274+
a = abs(x)
275+
276+
num = numerator(a)
277+
den = denominator(a)
278+
279+
# Handle special cases
280+
num_is_zero = iszero(num)
281+
den_is_zero = iszero(den)
282+
if den_is_zero
283+
num_is_zero && return to_float(NaN) # 0/0
284+
return to_float(s * Inf) # n/0 = sign(n) * Inf
285+
end
286+
num_is_zero && return to_float(false) # 0/n = 0
287+
288+
F = typeof(to_float(false))
289+
c = rational_to_float_components(
290+
num, den, precision, T, RoundNearest, min_normal_number_exponent(F))
291+
bw = bit_width(c.mantissa)
292+
293+
# TODO: `ldexp` could be replaced with a mere bit of bit twiddling
294+
# in the case of `Float16`, `Float32`, `Float64`
295+
ldexp(s * to_float(c.mantissa), c.exponent - bw + true)::F
136296
end
137297

298+
function rational_to_float_promote_type(::Type{F},
299+
::Type{S}) where {F<:AbstractFloat,S<:Integer}
300+
BigInt
301+
end
302+
303+
function rational_to_float_promote_type(::Type{F},
304+
::Type{S}) where {F<:AbstractFloat,S<:Unsigned}
305+
rational_to_float_promote_type(F, widen(signed(S)))
306+
end
307+
308+
# As an optimization, use a narrower type when possible.
309+
rational_to_float_promote_type(::Type{Float16}, ::Type{<:Union{Int8,Int16}}) = Int32
310+
rational_to_float_promote_type(::Type{Float32}, ::Type{<:Union{Int8,Int16}}) = Int64
311+
rational_to_float_promote_type(::Type{Float64}, ::Type{<:Union{Int8,Int16}}) = Int128
312+
rational_to_float_promote_type(::Type{<:Union{Float16,Float32}}, ::Type{Int32}) = Int64
313+
rational_to_float_promote_type(::Type{Float64}, ::Type{Int32}) = Int128
314+
rational_to_float_promote_type(::Type{<:Union{Float16,Float32,Float64}}, ::Type{Int64}) = Int128
315+
316+
function rational_bool_to_float(to_float::C,
317+
x::Rational{Bool}) where {C<:Union{Type,Function}}
318+
n = numerator(x)
319+
if iszero(denominator(x))
320+
if iszero(n)
321+
to_float(NaN) # 0/0
322+
else
323+
to_float(Inf) # 1/0
324+
end
325+
else
326+
# n/1 = n
327+
to_float(n)
328+
end
329+
end
330+
331+
function rational_to_float(::Type{F}, x::Rational{Bool}) where {F<:AbstractFloat}
332+
rational_bool_to_float(F, x)::F
333+
end
334+
335+
function rational_to_float(::Type{F}, x::Rational{T}) where {F<:AbstractFloat,T<:Integer}
336+
rational_to_float_impl(F, rational_to_float_promote_type(F, T), x, precision(F))::F
337+
end
338+
339+
function (::Type{F})(x::Rational) where {F<:AbstractFloat}
340+
y = Rational(promote(numerator(x), denominator(x))...)
341+
rational_to_float(F, y)::F
342+
end
343+
344+
AbstractFloat(x::Q) where {Q<:Rational} = float(Q)(x)::AbstractFloat
345+
138346
function Rational{T}(x::AbstractFloat) where T<:Integer
139347
r = rationalize(T, x, tol=0)
140348
x == convert(typeof(x), r) || throw(InexactError(:Rational, Rational{T}, x))

0 commit comments

Comments
 (0)