Skip to content

Commit c517246

Browse files
committed
Base: make constructing a float from a rational exact
Two example bugs that are now fixed:

```julia-repl
julia> Float16(9//70000) # incorrect because `Float16(70000)` overflows
Float16(0.0)

julia> Float16(big(9//70000)) # correct because of promotion to `BigFloat`
Float16(0.0001286)

julia> Float32(16777216//16777217) < 1 # `16777217` doesn't fit in a `Float32` mantissa
false
```

Another way to fix this would have been to convert the numerator and denominator into `BigFloat` exactly and then divide one with the other. However, the requirement for exactness means that the `BigFloat` precision would need to be manipulated, something that seems to be problematic in Julia. So implement the division using integer arithmetic.

Updates #45213
1 parent b21f100 commit c517246

File tree

3 files changed

+483
-11
lines changed

3 files changed

+483
-11
lines changed

base/mpfr.jl

Lines changed: 68 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ import
1717
cbrt, typemax, typemin, unsafe_trunc, floatmin, floatmax, rounding,
1818
setrounding, maxintfloat, widen, significand, frexp, tryparse, iszero,
1919
isone, big, _string_n, decompose, minmax,
20-
sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand
20+
sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand,
21+
to_int8_if_bool, rational_to_float_components, rational_to_float_impl,
22+
rounding_mode_translated_for_abs
2123

2224
import ..Rounding: rounding_raw, setrounding_raw
2325

@@ -280,14 +282,73 @@ BigFloat(x::Union{UInt8,UInt16,UInt32}, r::MPFRRoundingMode=ROUNDING_MODE[]; pre
280282
BigFloat(x::Union{Float16,Float32}, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
281283
BigFloat(Float64(x), r; precision=precision)
282284

283-
function BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[])
284-
setprecision(BigFloat, precision) do
285-
setrounding_raw(BigFloat, r) do
286-
BigFloat(numerator(x))::BigFloat / BigFloat(denominator(x))::BigFloat
287-
end
288-
end
285+
# Set `z` in place by calling MPFR's `mpfr_set_z_2exp` with the integer `n`,
# the power-of-two exponent `exp` and the rounding mode `rm`.
#
# The status value returned by MPFR is discarded; the function always returns
# `nothing`.
function set_2exp!(z::BigFloat, n::BigInt, exp::Int, rm::MPFRRoundingMode)
    # The exponent parameter of `mpfr_set_z_2exp` is an `mpfr_exp_t`, which is
    # a C `long` in a default MPFR build. Declaring it as `Clong` (instead of
    # `Int`) makes `ccall` convert `exp` with a range check rather than
    # relying on `Int` and `long` having the same width on every platform.
    ccall(
        (:mpfr_set_z_2exp, libmpfr),
        Int32,
        (Ref{BigFloat}, Ref{BigInt}, Clong, MPFRRoundingMode),
        z, n, exp, rm,
    )
    return nothing
end
290294

295+
# `Rational{Bool}` method: delegate to `rational_to_float_impl`, converting
# its result to a `BigFloat` of precision `requested_precision`.
#
# The arguments follow the order of the `Rational{Bool}` method of
# `rational_to_float_impl`: the conversion callable first, then the integer
# type, the rational and a precision. The precision argument is unnamed (and
# so unused) in that method, hence the `-1` placeholder.
#
# NOTE(review): the rounding argument of this method is a `RoundingMode`,
# while the generic `rational_to_bigfloat` method takes an
# `MPFRRoundingMode` -- confirm which callers are meant to reach this method.
rational_to_bigfloat(
    ::Type{<:Integer},
    x::Rational{Bool},
    requested_precision::Integer,
    ::RoundingMode,
) = rational_to_float_impl(
    # Capture the precision in a `let` so the closure doesn't box it.
    let prec = requested_precision
        y -> BigFloat(y, precision = prec)
    end,
    Bool,
    x,
    -1,
)::BigFloat
306+
307+
function rational_to_bigfloat(
308+
::Type{T},
309+
x::Rational,
310+
requested_precision::Integer,
311+
romo::MPFRRoundingMode,
312+
) where {T<:Integer}
313+
s = Int8(sign(numerator(x)))
314+
a = abs(x)
315+
316+
num = to_int8_if_bool(numerator(a))
317+
den = to_int8_if_bool(denominator(a))
318+
319+
# Handle special cases
320+
num_is_zero = iszero(num)
321+
den_is_zero = iszero(den)
322+
if den_is_zero
323+
num_is_zero && return BigFloat(precision = requested_precision)
324+
return BigFloat(s * Inf, precision = requested_precision)
325+
end
326+
num_is_zero && return BigFloat(false, precision = requested_precision)
327+
328+
components = rational_to_float_components(
329+
num,
330+
den,
331+
requested_precision,
332+
T,
333+
rounding_mode_translated_for_abs(convert(RoundingMode, romo), s),
334+
)
335+
ret = BigFloat(precision = requested_precision)
336+
337+
# The rounding mode doesn't matter because there shouldn't be any
338+
# rounding (MPFR doesn't have subnormals).
339+
set_2exp!(ret, s * components.mantissa, Int(components.exponent), MPFRRoundToZero)
340+
341+
ret
342+
end
343+
344+
BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
345+
rational_to_bigfloat(
346+
BigInt,
347+
x,
348+
precision,
349+
r,
350+
)::BigFloat
351+
291352
function tryparse(::Type{BigFloat}, s::AbstractString; base::Integer=0, precision::Integer=DEFAULT_PRECISION[], rounding::MPFRRoundingMode=ROUNDING_MODE[])
292353
!isempty(s) && isspace(s[end]) && return tryparse(BigFloat, rstrip(s), base = base)
293354
z = BigFloat(precision=precision)

base/rational.jl

Lines changed: 293 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,12 +129,301 @@ Bool(x::Rational) = x==0 ? false : x==1 ? true :
129129
(::Type{T})(x::Rational) where {T<:Integer} = (isinteger(x) ? convert(T, x.num)::T :
130130
throw(InexactError(nameof(T), T, x)))
131131

132-
AbstractFloat(x::Rational) = (float(x.num)/float(x.den))::AbstractFloat
133-
function (::Type{T})(x::Rational{S}) where T<:AbstractFloat where S
134-
P = promote_type(T,S)
135-
convert(T, convert(P,x.num)/convert(P,x.den))::T
132+
# Number of base-2 digits of `n`, without padding (so zero has width zero).
function bit_width(n)
    return ndigits(n, base = UInt8(2), pad = false)
end
133+
134+
function divrem_pow2(num::Integer, n::Integer)
135+
quot = num >> n
136+
rema = num - (quot << n)
137+
(quot, rema)
138+
end
139+
140+
rounding_mode_translated_for_abs(
141+
rm::Union{
142+
RoundingMode{:ToZero},
143+
RoundingMode{:FromZero},
144+
RoundingMode{:Nearest},
145+
RoundingMode{:NearestTiesAway},
146+
},
147+
::Real,
148+
) =
149+
rm
150+
151+
rounding_mode_translated_for_abs(::RoundingMode{:Down}, sign::Real) =
152+
!signbit(sign) ? RoundToZero : RoundFromZero
153+
154+
rounding_mode_translated_for_abs(::RoundingMode{:Up}, sign::Real) =
155+
!signbit(sign) ? RoundFromZero : RoundToZero
156+
157+
# `num`, `den` are positive integers. `requested_bit_width` is the
158+
# requested floating-point precision. `T` is the integer type that
159+
# we'll be working with mostly, it needs to be wide enough.
160+
function rational_to_float_components_impl(
161+
num::Integer,
162+
den::Integer,
163+
requested_bit_width::Integer,
164+
::Type{T},
165+
romo::RoundingMode,
166+
) where {T<:Integer}
167+
num_bit_width = bit_width(num)
168+
den_bit_width = bit_width(den)
169+
170+
numT = T(num)
171+
denT = T(den)
172+
173+
# Must be positive.
174+
bit_shift_1 = den_bit_width - num_bit_width + requested_bit_width
175+
(false bit_shift_1) || (bit_shift_1 = zero(bit_shift_1))
176+
177+
# `T` must be wide enough to make overflow impossible during the
178+
# left shift here, we must not lose the high bits.
179+
#
180+
# Similarly, `scaled_num` must not be negative.
181+
scaled_num = numT << bit_shift_1
182+
183+
# `quo0` must be at least `requested_bit_width` bits wide.
184+
(quo0, rem0) = divrem(scaled_num, denT, RoundToZero)
185+
quo0_bit_width = bit_width(quo0)
186+
187+
# Now we have a mantissa in `quo0`, but need to round it to the
188+
# required precision.
189+
190+
# Should often be zero, but never negative.
191+
bit_shift_2 = quo0_bit_width - requested_bit_width
192+
193+
# `quo1` is `div(scaled_num, denT << bit_shift_2, RoundToZero)` and should
194+
# be exactly `requested_bit_width` bits wide.
195+
(quo1, rem1) = divrem_pow2(quo0, bit_shift_2)
196+
197+
# `rem(scaled_num, denT << bit_shift_2, RoundToZero)`, but without the extra
198+
# computation.
199+
rem_total = rem1 * den + rem0
200+
201+
result_is_exact = iszero(rem_total)
202+
203+
mantissa = quo1
204+
205+
mantissa_carry = false
206+
207+
romo_is_to = romo == RoundToZero
208+
romo_is_af = romo == RoundFromZero
209+
romo_is_ntte = romo == RoundNearest
210+
romo_is_ntaf = romo == RoundNearestTiesAway
211+
212+
romo_is_to_nearest = romo_is_ntte | romo_is_ntaf
213+
214+
if !result_is_exact & !romo_is_to
215+
# Finish the rounding
216+
217+
(rem_quo, rem_rem) = divrem_pow2(rem_total, bit_shift_2)
218+
to_nearest_compar_hi = rem_quo - (den >> true)
219+
to_nearest_compar_lo = (rem_rem << true) - ((den & true) << bit_shift_2)
220+
to_nearest_compar_hi_iszero = iszero(to_nearest_compar_hi)
221+
to_nearest_compar_lo_iszero = iszero(to_nearest_compar_lo)
222+
to_nearest_is_greater_than_half =
223+
(false < to_nearest_compar_hi) |
224+
(
225+
to_nearest_compar_hi_iszero &
226+
(false < to_nearest_compar_lo)
227+
)
228+
to_nearest_is_tied =
229+
to_nearest_compar_hi_iszero &
230+
to_nearest_compar_lo_iszero
231+
232+
mantissa_is_even = iszero(mantissa & true)
233+
234+
# True iff precision is one.
235+
mantissa_is_one = isone(mantissa)
236+
237+
if (
238+
romo_is_af |
239+
(
240+
romo_is_to_nearest &
241+
(
242+
to_nearest_is_greater_than_half |
243+
(
244+
to_nearest_is_tied &
245+
!mantissa_is_one &
246+
(romo_is_ntaf | (romo_is_ntte & !mantissa_is_even))
247+
)
248+
)
249+
)
250+
)
251+
# Round up
252+
253+
# Assuming `T` is wide enough and there's no overflow.
254+
mantissa += true
255+
256+
# We need to decrease the bit width in case it increased.
257+
mantissa_carry = ispow2(mantissa)
258+
mantissa >>= mantissa_carry
259+
elseif romo_is_to_nearest & to_nearest_is_tied & mantissa_is_one
260+
# Mantissa is one, which means the precision is also
261+
# one. Be consistent with the `BigFloat` behavior, for
262+
# example: `BigFloat(3) == BigFloat(3.0) == 4`.
263+
mantissa_carry = true
264+
end
265+
end
266+
267+
# `mantissa` should now again be exactly `requested_bit_width` bits
268+
# wide.
269+
270+
exp = bit_shift_2 - bit_shift_1 + mantissa_carry
271+
272+
(
273+
mantissa = mantissa,
274+
exponent = exp,
275+
is_exact = result_is_exact,
276+
)
277+
end
278+
279+
# `num`, `den` are positive integers. `requested_bit_width` is the
280+
# requested floating-point precision and must be positive. `T` is the
281+
# integer type that we'll be working with mostly, it needs to be wide
282+
# enough.
283+
function rational_to_float_components(
284+
num::Integer,
285+
den::Integer,
286+
requested_bit_width::Integer,
287+
::Type{T},
288+
romo::RoundingMode,
289+
) where {T<:Integer}
290+
(false < requested_bit_width) || error("nonpositive bit width")
291+
292+
# Factor out powers of two
293+
trailing_zeros_num = trailing_zeros(num)
294+
trailing_zeros_den = trailing_zeros(den)
295+
num >>= trailing_zeros_num
296+
den >>= trailing_zeros_den
297+
298+
c = rational_to_float_components_impl(
299+
num, den, requested_bit_width, T, romo,
300+
)
301+
302+
(
303+
mantissa = c.mantissa,
304+
exponent = c.exponent + trailing_zeros_num - trailing_zeros_den,
305+
is_exact = c.is_exact,
306+
)
307+
end
308+
309+
function rational_to_float_impl(
310+
to_float::C,
311+
::Type{<:Integer},
312+
x::Rational{Bool},
313+
::Integer,
314+
) where {C<:Union{Type,Function}}
315+
n = numerator(x)
316+
if iszero(denominator(x))
317+
if iszero(n)
318+
to_float(NaN) # 0/0
319+
else
320+
to_float(Inf) # 1/0
321+
end
322+
else
323+
# n/1 = n
324+
to_float(n)
325+
end
136326
end
137327

328+
# Widen a `Bool` to `Int8`; any other integer is returned as is.
function to_int8_if_bool(n::Bool)
    return Int8(n)
end

function to_int8_if_bool(n::Integer)
    return n
end
330+
331+
# Assuming the wanted rounding mode is round to nearest with ties to
332+
# even.
333+
#
334+
# `requested_precision` must be positive.
335+
function rational_to_float_impl(
336+
to_float::C,
337+
::Type{T},
338+
x::Rational,
339+
requested_precision::Integer,
340+
) where {C<:Union{Type,Function},T<:Integer}
341+
s = Int8(sign(numerator(x)))
342+
a = abs(x)
343+
344+
num = to_int8_if_bool(numerator(a))
345+
den = to_int8_if_bool(denominator(a))
346+
347+
# Handle special cases
348+
num_is_zero = iszero(num)
349+
den_is_zero = iszero(den)
350+
if den_is_zero
351+
num_is_zero && return to_float(NaN)
352+
return to_float(s * Inf)
353+
end
354+
num_is_zero && return to_float(false)
355+
356+
components = rational_to_float_components(
357+
num,
358+
den,
359+
requested_precision,
360+
T,
361+
RoundNearest,
362+
)
363+
mantissa = to_float(components.mantissa)
364+
365+
# TODO: `ldexp` could be replaced with a mere bit of bit twiddling
366+
# in the case of `Float16`, `Float32`, `Float64`
367+
ret = ldexp(s * mantissa, components.exponent)
368+
369+
# TODO: faster?
370+
if iszero(ret) | issubnormal(ret)
371+
# "Rounding to odd" to prevent double rounding error, see
372+
# https://hal-ens-lyon.archives-ouvertes.fr/inria-00080427v2
373+
components = rational_to_float_components(
374+
num,
375+
den,
376+
requested_precision,
377+
T,
378+
RoundToZero,
379+
)
380+
mantissa = to_float(components.mantissa | !components.is_exact)
381+
382+
# TODO: `ldexp` could be replaced with a mere bit of bit
383+
# twiddling in the case of `Float16`, `Float32`, `Float64`
384+
ret = ldexp(s * mantissa, components.exponent)
385+
end
386+
387+
ret
388+
end
389+
390+
rational_to_float_promote_type(
391+
::Type{F},
392+
::Type{S},
393+
) where {F<:AbstractFloat,S<:Integer} =
394+
BigInt
395+
396+
rational_to_float_promote_type(
397+
::Type{F},
398+
::Type{S},
399+
) where {F<:AbstractFloat,S<:Unsigned} =
400+
rational_to_float_promote_type(F, signed(S))
401+
402+
# As an optimization, use a narrower type when possible.
403+
rational_to_float_promote_type(::Type{Float16}, ::Type{<:Union{Int8,Int16}}) = Int32
404+
rational_to_float_promote_type(::Type{Float32}, ::Type{<:Union{Int8,Int16}}) = Int64
405+
rational_to_float_promote_type(::Type{Float64}, ::Type{<:Union{Int8,Int16}}) = Int128
406+
rational_to_float_promote_type(::Type{<:Union{Float16,Float32}}, ::Type{Int32}) = Int64
407+
rational_to_float_promote_type(::Type{Float64}, ::Type{Int32}) = Int128
408+
rational_to_float_promote_type(::Type{<:Union{Float16,Float32,Float64}}, ::Type{Int64}) = Int128
409+
410+
(::Type{F})(x::Rational) where {F<:AbstractFloat} =
411+
rational_to_float_impl(
412+
F,
413+
rational_to_float_promote_type(
414+
F,
415+
promote_type(
416+
typeof(numerator(x)),
417+
typeof(denominator(x)),
418+
),
419+
),
420+
x,
421+
precision(F),
422+
)::F
423+
424+
AbstractFloat(x::Q) where {Q<:Rational} =
425+
float(Q)(x)::AbstractFloat
426+
138427
function Rational{T}(x::AbstractFloat) where T<:Integer
139428
r = rationalize(T, x, tol=0)
140429
x == convert(typeof(x), r) || throw(InexactError(:Rational, Rational{T}, x))

0 commit comments

Comments
 (0)