Base: make constructing a float from a rational exact

nsajko · nsajko · commit 28d0c8e526cc · 2023-05-17T04:17:43.000+02:00
Two example bugs that are now fixed:

```julia-repl
julia> Float16(9//70000) # incorrect because `Float16(70000)` overflows
Float16(0.0)

julia> Float16(big(9//70000)) # correct because of promotion to `BigFloat`
Float16(0.0001286)

julia> Float32(16777216//16777217) < 1 # `16777217` doesn't fit in a `Float32` mantissa
false
```

Another way to fix this would have been to convert the numerator and denominator into `BigFloat` exactly and then divide one by the other. However, the requirement for exactness means that the `BigFloat` precision would need to be manipulated, something that seems to be problematic in Julia. So implement the division using integer arithmetic.

As a bonus, constructing a `BigFloat` from a `Rational` is now thread-safe when the rounding mode and precision are provided to the constructor.

Updates #45213
diff --git a/base/float.jl b/base/float.jl
@@ -113,6 +113,10 @@ for T in (Float16, Float32, Float64)
     @eval exponent_raw_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)))
 end
 
+# The minimum exponent for a normal number; the generic fallback is `nothing`.
+min_normal_number_exponent(::Type{<:AbstractFloat}) = nothing
+min_normal_number_exponent(::Type{F}) where {F<:IEEEFloat} = true - exponent_bias(F)  # `true` is 1
+
 """
     exponent_max(T)
 
diff --git a/base/mpfr.jl b/base/mpfr.jl
@@ -17,7 +17,9 @@ import
         cbrt, typemax, typemin, unsafe_trunc, floatmin, floatmax, rounding,
         setrounding, maxintfloat, widen, significand, frexp, tryparse, iszero,
         isone, big, _string_n, decompose, minmax,
-        sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand
+        sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand,
+        rational_to_float_components, rational_bool_to_float, bit_width,
+        rounding_mode_translated_for_abs
 
 import ..Rounding: rounding_raw, setrounding_raw
 
@@ -280,12 +282,63 @@ BigFloat(x::Union{UInt8,UInt16,UInt32}, r::MPFRRoundingMode=ROUNDING_MODE[]; pre
 BigFloat(x::Union{Float16,Float32}, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
     BigFloat(Float64(x), r; precision=precision)
 
-function BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[])
-    setprecision(BigFloat, precision) do
-        setrounding_raw(BigFloat, r) do
-            BigFloat(numerator(x))::BigFloat / BigFloat(denominator(x))::BigFloat
-        end
+function set_2exp!(z::BigFloat, n::BigInt, exp::Int, rm::MPFRRoundingMode)  # wraps `mpfr_set_z_2exp`
+    ccall(
+        (:mpfr_set_z_2exp, libmpfr),
+        Int32,  # the C function's return value; it is discarded below
+        (Ref{BigFloat}, Ref{BigInt}, Int, MPFRRoundingMode),
+        z, n, exp, rm,
+    )
+    nothing  # the result is delivered by mutating `z`
+end
+
+# Convert `x` to a `BigFloat` of precision `prec`, rounding as given by `romo`.
+function rational_to_bigfloat(x::Rational, romo::MPFRRoundingMode, prec::Integer)
+    s = Int8(sign(numerator(x)))
+    # Take the magnitude only after widening to `BigInt`: `abs` applied to the
+    # `typemin` numerator of a fixed-width signed type would overflow.
+    num = abs(BigInt(numerator(x)))
+    den = denominator(x)  # the denominator of a `Rational` is nonnegative
+    # Handle special cases
+    num_is_zero = iszero(num)
+    den_is_zero = iszero(den)
+    if den_is_zero
+        num_is_zero && return BigFloat(precision = prec)  # 0/0
+        # n/0 = Inf * sign(n)
+        #
+        # The rounding mode doesn't matter for infinity.
+        return BigFloat(s * Inf, MPFRRoundToZero, precision = prec)
+    end
+    # 0/1 = 0
+    #
+    # The rounding mode doesn't matter for zero.
+    num_is_zero && return BigFloat(false, MPFRRoundToZero, precision = prec)
+
+    # Only the magnitude is converted, so directed rounding modes are mirrored
+    # for negative values.
+    rm = rounding_mode_translated_for_abs(convert(RoundingMode, romo), s)
+    c = rational_to_float_components(num, den, prec, BigInt, rm)
+    bw = bit_width(c.mantissa)
+    ret = BigFloat(precision = prec)
+    # The rounding mode doesn't matter because there shouldn't be any
+    # rounding, as MPFR doesn't have subnormals.
+    set_2exp!(ret, s * c.mantissa, Int(c.exponent - bw + true), MPFRRoundToZero)
+    ret
+end
+
+function rational_to_bigfloat(x::Rational{Bool}, ::MPFRRoundingMode, prec::Integer)
+    # The rounding mode is ignored: it doesn't matter for conversion from booleans.
+    conv = let p = prec  # converter producing a `BigFloat` of precision `prec`
+        y -> BigFloat(y, MPFRRoundToZero, precision = p)
     end
+    rational_bool_to_float(conv, x)::BigFloat  # `conv` is applied to `NaN`, `Inf` or a `Bool`
+end
+
+function BigFloat(x::Rational,  # the rational number to convert
+                  r::MPFRRoundingMode = ROUNDING_MODE[];  # defaults to `ROUNDING_MODE[]`
+                  precision::Integer = DEFAULT_PRECISION[])  # defaults to `DEFAULT_PRECISION[]`
+    y = Rational(promote(numerator(x), denominator(x))...)  # rebuilt from the promoted parts
+    rational_to_bigfloat(y, r, precision)::BigFloat  # `Rational{Bool}` has its own method
 end
 
 function tryparse(::Type{BigFloat}, s::AbstractString; base::Integer=0, precision::Integer=DEFAULT_PRECISION[], rounding::MPFRRoundingMode=ROUNDING_MODE[])
diff --git a/base/rational.jl b/base/rational.jl
@@ -129,12 +129,220 @@ Bool(x::Rational) = x==0 ? false : x==1 ? true :
 (::Type{T})(x::Rational) where {T<:Integer} = (isinteger(x) ? convert(T, x.num)::T :
     throw(InexactError(nameof(T), T, x)))
 
-AbstractFloat(x::Rational) = (float(x.num)/float(x.den))::AbstractFloat
-function (::Type{T})(x::Rational{S}) where T<:AbstractFloat where S
-    P = promote_type(T,S)
-    convert(T, convert(P,x.num)/convert(P,x.den))::T
+bit_width(n) = ndigits(n, base = UInt8(2), pad = false)  # count of base-2 digits, no padding
+
+function divrem_pow2(num::Integer, n::Integer)  # quotient and remainder w.r.t. `2^n`, via shifts
+    quot = num >> n  # drop the low `n` bits
+    rema = num - (quot << n)  # what the right shift discarded
+    (quot, rema)
+end
+
+const RoundingModesIndependentFromSign = Union{  # modes that need no translation for `abs`
+    RoundingMode{:ToZero}, RoundingMode{:FromZero},
+    RoundingMode{:Nearest}, RoundingMode{:NearestTiesAway}}
+
+rounding_mode_translated_for_abs(rm::RoundingModesIndependentFromSign, ::Real) = rm  # unchanged
+# Directed modes map to the mode that acts the same way on the absolute value.
+function rounding_mode_translated_for_abs(::RoundingMode{:Down}, sign::Real)
+    !signbit(sign) ? RoundToZero : RoundFromZero  # down is toward zero unless negative
+end
+
+function rounding_mode_translated_for_abs(::RoundingMode{:Up}, sign::Real)
+    !signbit(sign) ? RoundFromZero : RoundToZero  # up is away from zero unless negative
+end
+
+clamped_to_zero(x) = max(zero(x), x)  # `x` if it is positive, otherwise zero
+
+struct FloatComponentsResult{S<:Integer,T<:Integer}  # result of `rational_to_float_components`
+    mantissa::S  # integer significand
+    exponent::T  # normalized binary exponent
+    inexact::Bool  # whether the rounded division left a nonzero remainder
+    underflowed::Bool  # whether the exponent fell below the given lower bound
+
+    function FloatComponentsResult(m::S, e::T;
+                                   inexact::Bool,
+                                   underflowed::Bool) where {S<:Integer,T<:Integer}
+        new{S,T}(m, e, inexact, underflowed)  # both flags are required keyword arguments
+    end
+end
+
+# Compute the mantissa and exponent of `num/den` using integer arithmetic.
+# `num`, `den` are positive integers. `requested_precision` is the requested
+# floating-point precision. `T` is the integer type used for most of the
+# work; it needs to be wide enough. `exp_lower_bound` is the minimum
+# allowed normalized exponent for normal numbers (`nothing`: no bound).
+function rational_to_float_components_impl(num::Integer,
+                                           den::Integer,
+                                           requested_precision::Integer,
+                                           ::Type{T},
+                                           romo::RoundingMode,
+                                           exp_lower_bound::Union{Nothing,Integer}) where
+{T<:Integer}
+    num_bit_width = bit_width(num)
+    den_bit_width = bit_width(den)
+
+    # `T` must be wide enough to avoid overflow.
+    numT = T(num)
+    denT = T(den)
+
+    # Creates a mantissa in `quo_` that's at least `requested_precision` bits
+    # wide. NOTE(review): this divides by `den`, not `denT` as further below.
+    bit_shift_num_ = clamped_to_zero(den_bit_width - num_bit_width + requested_precision)
+    quo_ = div(numT << bit_shift_num_, den, RoundToZero)
+
+    # `excess_precision` is nonnegative. Experiments indicate that, when
+    # iterating over all possible numerators and denominators below some
+    # bit width, with some fixed value for `requested_precision`, its
+    # maximal attained value is
+    # `max(1, max(num_bit_width, den_bit_width) - requested_precision)`.
+    # So in the worst case we have `requested_precision == 1` and
+    # `excess_precision == max(1, max(num_bit_width, den_bit_width) - 1)`
+    excess_precision = bit_width(quo_) - requested_precision  # NOTE(review): not used below
+
+    # Normalized exponent
+    nexp = bit_width(quo_) - bit_shift_num_ - true
+
+    # Take possible underflow into account: if there's underflow, the
+    # precision needs to be less than requested.
+    adjusted_precision = requested_precision
+    if !isnothing(exp_lower_bound)
+        underflow = clamped_to_zero(exp_lower_bound - nexp)  # exponent steps below the bound
+        adjusted_precision -= underflow
+    end
+    adjusted_excess_precision = bit_width(quo_) - adjusted_precision
+
+    (adjusted_precision < false) && return FloatComponentsResult(  # `false` acts as zero
+        zero(quo_), zero(nexp), inexact = true, underflowed = true)
+
+    # Creates a mantissa in `quo` that's exactly `adjusted_precision`
+    # bits wide, except if rounding up happens, in which case the bit
+    # width may be `adjusted_precision + 1`.
+    bit_shift_num = clamped_to_zero(bit_shift_num_ - adjusted_excess_precision)
+    bit_shift_den = adjusted_excess_precision - (bit_shift_num_ - bit_shift_num)  # nonnegative
+    (quo, rem) = divrem(numT << bit_shift_num, denT << bit_shift_den, romo)
+
+    result_is_exact = iszero(rem)
+
+    nexp_final = nexp + (adjusted_precision < bit_width(quo))  # add one if `quo` got wider
+    is_subnormal = !isnothing(exp_lower_bound) && (nexp_final < exp_lower_bound)
+
+    iszero(quo) || (quo >>= trailing_zeros(quo))  # strip trailing zero bits
+
+    # The bit width of `quo` is now less than or equal to
+    # `adjusted_precision`, except if `adjusted_precision` is zero, in
+    # which case `bit_width(quo)` may be one, in case rounding up
+    # happened.
+
+    FloatComponentsResult(
+        quo, nexp_final, inexact = !result_is_exact, underflowed = is_subnormal)
+end
+
+# `num`, `den` are positive integers. `bit_width` is the requested
+# floating-point precision and must be positive. `T` is the integer
+# type used for most of the work; it needs to be wide enough.
+# `exp_lower_bound` is the minimum allowed normalized exponent for
+# normal numbers; the default `nothing` means there is no bound.
+function rational_to_float_components(num::Integer,
+                                      den::Integer,
+                                      bit_width::Integer,  # shadows the `bit_width` function here
+                                      ::Type{T},
+                                      romo::RoundingMode,
+                                      exp_lower_bound::Union{Nothing,Integer} =
+                                          nothing) where {T<:Integer}
+    h(::Nothing, ::Integer) = nothing  # no bound stays no bound
+    h(a::Integer, b::Integer) = a - b  # otherwise shift the bound by `b`
+
+    # Factor out powers of two
+    tz_num = trailing_zeros(num)
+    tz_den = trailing_zeros(den)
+    dexp = tz_num - tz_den  # exponent contributed by the factored-out powers of two
+    c = rational_to_float_components_impl(
+        num >> tz_num, den >> tz_den, bit_width, T, romo, h(exp_lower_bound, dexp))
+    FloatComponentsResult(  # same result, with `dexp` added back to the exponent
+        c.mantissa, c.exponent + dexp, inexact = c.inexact, underflowed = c.underflowed)
+end
+
+# Convert `x` to a float via `to_float`, rounding to nearest with ties to
+# even. `T` is the wide integer type used for the arithmetic.
+#
+# `precision` must be positive.
+function rational_to_float_impl(to_float::C,
+                                ::Type{T},
+                                x::Rational,
+                                precision::Integer) where {C<:Union{Type,Function},T<:Integer}
+    s = Int8(sign(numerator(x)))
+
+    # Take the magnitude only after converting to the wider `T`: `abs` of a
+    # `typemin` numerator would overflow in the original integer type.
+    num = abs(T(numerator(x)))
+    den = denominator(x)  # the denominator of a `Rational` is nonnegative
+    # Handle special cases
+    num_is_zero = iszero(num)
+    den_is_zero = iszero(den)
+    if den_is_zero
+        num_is_zero && return to_float(NaN)  # 0/0
+        return to_float(s * Inf)  # n/0 = sign(n) * Inf
+    end
+    num_is_zero && return to_float(false)  # 0/n = 0
+
+    F = typeof(to_float(false))  # the float type that `to_float` produces
+    c = rational_to_float_components(
+        num, den, precision, T, RoundNearest, min_normal_number_exponent(F))
+    bw = bit_width(c.mantissa)
+
+    # TODO: `ldexp` could be replaced with a mere bit of bit twiddling
+    # in the case of `Float16`, `Float32`, `Float64`
+    ldexp(s * to_float(c.mantissa), c.exponent - bw + true)::F
 end
 
+function rational_to_float_promote_type(::Type{F},
+                                        ::Type{S}) where {F<:AbstractFloat,S<:Integer}
+    BigInt  # generic fallback for the working integer type
+end
+
+function rational_to_float_promote_type(::Type{F},
+                                        ::Type{S}) where {F<:AbstractFloat,S<:Unsigned}
+    rational_to_float_promote_type(F, widen(signed(S)))  # defer to the widened signed type
+end
+
+# As an optimization, use a narrower type than `BigInt` when possible.
+rational_to_float_promote_type(::Type{Float16},                          ::Type{<:Union{Int8,Int16}}) = Int32
+rational_to_float_promote_type(::Type{Float32},                          ::Type{<:Union{Int8,Int16}}) = Int64
+rational_to_float_promote_type(::Type{Float64},                          ::Type{<:Union{Int8,Int16}}) = Int128
+rational_to_float_promote_type(::Type{<:Union{Float16,Float32}},         ::Type{Int32})               = Int64
+rational_to_float_promote_type(::Type{Float64},                          ::Type{Int32})               = Int128
+rational_to_float_promote_type(::Type{<:Union{Float16,Float32,Float64}}, ::Type{Int64})               = Int128
+
+function rational_bool_to_float(to_float::C,  # callable producing the float result
+                                x::Rational{Bool}) where {C<:Union{Type,Function}}
+    n = numerator(x)
+    if iszero(denominator(x))
+        if iszero(n)
+            to_float(NaN)  # 0/0
+        else
+            to_float(Inf)  # 1/0
+        end
+    else
+        # n/1 = n: the only nonzero `Bool` denominator is `true`
+        to_float(n)
+    end
+end
+
+function rational_to_float(::Type{F}, x::Rational{Bool}) where {F<:AbstractFloat}
+    rational_bool_to_float(F, x)::F  # `Bool` rationals need no division
+end
+
+function rational_to_float(::Type{F}, x::Rational{T}) where {F<:AbstractFloat,T<:Integer}
+    rational_to_float_impl(F, rational_to_float_promote_type(F, T), x, precision(F))::F  # general case
+end
+
+function (::Type{F})(x::Rational) where {F<:AbstractFloat}  # constructor `F(x)` for rationals
+    y = Rational(promote(numerator(x), denominator(x))...)  # rebuilt from the promoted parts
+    rational_to_float(F, y)::F  # dispatches on the integer type of `y`
+end
+
+AbstractFloat(x::Q) where {Q<:Rational} = float(Q)(x)::AbstractFloat  # float type picked by `float(Q)`
+
 function Rational{T}(x::AbstractFloat) where T<:Integer
     r = rationalize(T, x, tol=0)
     x == convert(typeof(x), r) || throw(InexactError(:Rational, Rational{T}, x))
diff --git a/test/rational.jl b/test/rational.jl