Base: make constructing a float from a rational exact

nsajko · nsajko · commit 2a109196d831 · 2023-05-11T22:44:27.000+02:00
Two example bugs that are now fixed: ```julia-repl julia> Float16(9//70000) # incorrect because `Float16(70000)` overflows Float16(0.0) julia> Float16(big(9//70000)) # correct because of promotion to `BigFloat` Float16(0.0001286) julia> Float32(16777216//16777217) < 1 # `16777217` doesn't fit in a `Float32` mantissa false ``` Another way to fix this would have been to convert the numerator and denominator into `BigFloat` exactly and then divide one with the other. However, the requirement for exactness means that the `BigFloat` precision would need to be manipulated, something that seems to be problematic in Julia. So implement the division using integer arithmetic and `ldexp`. Updates #45213
diff --git a/base/mpfr.jl b/base/mpfr.jl
@@ -17,7 +17,9 @@ import
         cbrt, typemax, typemin, unsafe_trunc, floatmin, floatmax, rounding,
         setrounding, maxintfloat, widen, significand, frexp, tryparse, iszero,
         isone, big, _string_n, decompose, minmax,
-        sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand
+        sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand,
+        to_int8_if_bool, rational_to_float_components, rational_to_float_impl,
+        rounding_mode_translated_for_abs
 
 import ..Rounding: rounding_raw, setrounding_raw
 
@@ -280,14 +282,73 @@ BigFloat(x::Union{UInt8,UInt16,UInt32}, r::MPFRRoundingMode=ROUNDING_MODE[]; pre
 BigFloat(x::Union{Float16,Float32}, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
     BigFloat(Float64(x), r; precision=precision)
 
-function BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[])
-    setprecision(BigFloat, precision) do
-        setrounding_raw(BigFloat, r) do
-            BigFloat(numerator(x))::BigFloat / BigFloat(denominator(x))::BigFloat
-        end
-    end
+function set_2exp!(z::BigFloat, n::BigInt, exp::Int, rm::MPFRRoundingMode)
+    ccall(
+        (:mpfr_set_z_2exp, libmpfr),
+        Int32,
+        (Ref{BigFloat}, Ref{BigInt}, Int, MPFRRoundingMode),
+        z, n, exp, rm,
+    )
+    nothing
 end
 
+rational_to_bigfloat(
+    ::Type{<:Integer},
+    x::Rational{Bool},
+    requested_precision::Integer,
+    ::RoundingMode,
+) = rational_to_float_impl(
+    x, Bool, -1,
+    let prec = requested_precision
+        y -> BigFloat(y, precision = prec)
+    end,
+)::BigFloat
+
+function rational_to_bigfloat(
+    ::Type{T},
+    x::Rational,
+    requested_precision::Integer,
+    romo::MPFRRoundingMode,
+) where {T<:Integer}
+    s = Int8(sign(numerator(x)))
+    a = abs(x)
+
+    num = to_int8_if_bool(numerator(a))
+    den = to_int8_if_bool(denominator(a))
+
+    # Handle special cases
+    num_is_zero = iszero(num)
+    den_is_zero = iszero(den)
+    if den_is_zero
+        num_is_zero && return BigFloat(precision = requested_precision)
+        return BigFloat(s * Inf, precision = requested_precision)
+    end
+    num_is_zero && return BigFloat(false, precision = requested_precision)
+
+    components = rational_to_float_components(
+      num,
+      den,
+      requested_precision,
+      T,
+      rounding_mode_translated_for_abs(convert(RoundingMode, romo), s),
+    )
+    ret = BigFloat(precision = requested_precision)
+
+    # The rounding mode doesn't matter because there shouldn't be any
+    # rounding (MPFR doesn't have subnormals).
+    set_2exp!(ret, s * components.mantissa, Int(components.exponent), MPFRRoundToZero)
+
+    ret
+end
+
+BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
+    rational_to_bigfloat(
+        BigInt,
+        x,
+        precision,
+        r,
+    )::BigFloat
+
 function tryparse(::Type{BigFloat}, s::AbstractString; base::Integer=0, precision::Integer=DEFAULT_PRECISION[], rounding::MPFRRoundingMode=ROUNDING_MODE[])
     !isempty(s) && isspace(s[end]) && return tryparse(BigFloat, rstrip(s), base = base)
     z = BigFloat(precision=precision)
diff --git a/base/rational.jl b/base/rational.jl
@@ -129,12 +129,300 @@ Bool(x::Rational) = x==0 ? false : x==1 ? true :
 (::Type{T})(x::Rational) where {T<:Integer} = (isinteger(x) ? convert(T, x.num)::T :
     throw(InexactError(nameof(T), T, x)))
 
-AbstractFloat(x::Rational) = (float(x.num)/float(x.den))::AbstractFloat
-function (::Type{T})(x::Rational{S}) where T<:AbstractFloat where S
-    P = promote_type(T,S)
-    convert(T, convert(P,x.num)/convert(P,x.den))::T
+bit_width(n) = ndigits(n, base = UInt8(2), pad = false)
+
+function divrem_pow2(num::Integer, n::Integer)
+    quot = num >> n
+    rema = num - (quot << n)
+    (quot, rema)
+end
+
+rounding_mode_translated_for_abs(
+    rm::Union{
+        RoundingMode{:ToZero},
+        RoundingMode{:FromZero},
+        RoundingMode{:Nearest},
+        RoundingMode{:NearestTiesAway},
+    },
+    ::Real,
+) =
+    rm
+
+rounding_mode_translated_for_abs(::RoundingMode{:Down}, sign::Real) =
+    !signbit(sign) ? RoundToZero : RoundFromZero
+
+rounding_mode_translated_for_abs(::RoundingMode{:Up}, sign::Real) =
+    !signbit(sign) ? RoundFromZero : RoundToZero
+
+# `num`, `den` are positive integers. `requested_bit_width` is the
+# requested floating-point precision. `T` is the integer type that
+# we'll be working with mostly, it needs to be wide enough.
+function rational_to_float_components_impl(
+    num::Integer,
+    den::Integer,
+    requested_bit_width::Integer,
+    ::Type{T},
+    romo::RoundingMode,
+) where {T<:Integer}
+    num_bit_width = bit_width(num)
+    den_bit_width = bit_width(den)
+
+    numT = T(num)
+    denT = T(den)
+
+    # Must be positive.
+    bit_shift_1 = den_bit_width - num_bit_width + requested_bit_width
+    (false ≤ bit_shift_1) || (bit_shift_1 = zero(bit_shift_1))
+
+    # `T` must be wide enough to make overflow impossible during the
+    # left shift here, we must not lose the high bits.
+    #
+    # Similarly, `scaled_num` must not be negative.
+    scaled_num = numT << bit_shift_1
+
+    # `quo0` must be at least `requested_bit_width` bits wide.
+    (quo0, rem0) = divrem(scaled_num, denT, RoundToZero)
+    quo0_bit_width = bit_width(quo0)
+
+    # Now we have a mantissa in `quo0`, but need to round it to the
+    # required precision.
+
+    # Should often be zero, but never negative.
+    bit_shift_2 = quo0_bit_width - requested_bit_width
+
+    # `quo1` is `div(scaled_num, denT << bit_shift_2, RoundToZero)` and should
+    # be exactly `requested_bit_width` bits wide.
+    (quo1, rem1) = divrem_pow2(quo0, bit_shift_2)
+
+    # `rem(scaled_num, denT << bit_shift_2, RoundToZero)`, but without the extra
+    # computation.
+    rem_total = rem1 * den + rem0
+
+    result_is_exact = iszero(rem_total)
+
+    mantissa = quo1
+
+    mantissa_carry = false
+
+    romo_is_to = romo == RoundToZero
+    romo_is_af = romo == RoundFromZero
+    romo_is_ntte = romo == RoundNearest
+    romo_is_ntaf = romo == RoundNearestTiesAway
+
+    romo_is_to_nearest = romo_is_ntte | romo_is_ntaf
+
+    if !result_is_exact & !romo_is_to
+        # Finish the rounding
+
+        (rem_quo, rem_rem) = divrem_pow2(rem_total, bit_shift_2)
+        to_nearest_compar_hi = rem_quo - (den >> true)
+        to_nearest_compar_lo = (rem_rem << true) - ((den & true) << bit_shift_2)
+        to_nearest_compar_hi_iszero = iszero(to_nearest_compar_hi)
+        to_nearest_compar_lo_iszero = iszero(to_nearest_compar_lo)
+        to_nearest_is_greater_than_half =
+            (false < to_nearest_compar_hi) |
+            (
+                to_nearest_compar_hi_iszero &
+                (false < to_nearest_compar_lo)
+            )
+        to_nearest_is_tied =
+            to_nearest_compar_hi_iszero &
+            to_nearest_compar_lo_iszero
+
+        mantissa_is_even = iszero(mantissa & true)
+
+        # True iff precision is one.
+        mantissa_is_one = isone(mantissa)
+
+        if (
+            romo_is_af |
+            (
+                romo_is_to_nearest &
+                (
+                    to_nearest_is_greater_than_half |
+                    (
+                        to_nearest_is_tied &
+                        !mantissa_is_one &
+                        (romo_is_ntaf | (romo_is_ntte & !mantissa_is_even))
+                    )
+                )
+            )
+        )
+            # Round up
+
+            # Assuming `T` is wide enough and there's no overflow.
+            mantissa += true
+
+            # We need to decrease the bit width in case it increased.
+            mantissa_carry = ispow2(mantissa)
+            mantissa >>= mantissa_carry
+        elseif romo_is_to_nearest & to_nearest_is_tied & mantissa_is_one
+            # Mantissa is one, which means the precision is also
+            # one. Be consistent with the `BigFloat` behavior, for
+            # example: `BigFloat(3) == BigFloat(3.0) == 4`.
+            mantissa_carry = true
+        end
+    end
+
+    # `mantissa` should now again be exactly `requested_bit_width` bits
+    # wide.
+
+    exp = bit_shift_2 - bit_shift_1 + mantissa_carry
+
+    (
+        mantissa = mantissa,
+        exponent = exp,
+        is_exact = result_is_exact,
+    )
+end
+
+# `num`, `den` are positive integers. `requested_bit_width` is the
+# requested floating-point precision. `T` is the integer type that
+# we'll be working with mostly, it needs to be wide enough.
+function rational_to_float_components(
+    num::Integer,
+    den::Integer,
+    requested_bit_width::Integer,
+    ::Type{T},
+    romo::RoundingMode,
+) where {T<:Integer}
+    (false < requested_bit_width) || error("nonpositive bit width")
+
+    # Factor out powers of two
+    trailing_zeros_num = trailing_zeros(num)
+    trailing_zeros_den = trailing_zeros(den)
+    num >>= trailing_zeros_num
+    den >>= trailing_zeros_den
+
+    c = rational_to_float_components_impl(
+        num, den, requested_bit_width, T, romo,
+    )
+
+    (
+        mantissa = c.mantissa,
+        exponent = c.exponent + trailing_zeros_num - trailing_zeros_den,
+        is_exact = c.is_exact,
+    )
+end
+
+function rational_to_float_impl(
+    to_float::C,
+    ::Type{<:Integer},
+    x::Rational{Bool},
+    ::Integer,
+) where {C<:Union{Type,Function}}
+    n = numerator(x)
+    if iszero(denominator(x))
+        if iszero(n)
+            to_float(NaN)  # 0/0
+        else
+            to_float(Inf)  # 1/0
+        end
+    else
+        # n/1 = n
+        to_float(n)
+    end
 end
 
+to_int8_if_bool(n::Bool) = Int8(n)
+to_int8_if_bool(n::Integer) = n
+
+# Assuming the wanted rounding mode is round to nearest with ties to
+# even.
+#
+# `requested_precision` must be positive.
+function rational_to_float_impl(
+    to_float::C,
+    ::Type{T},
+    x::Rational,
+    requested_precision::Integer,
+) where {C<:Union{Type,Function},T<:Integer}
+    s = Int8(sign(numerator(x)))
+    a = abs(x)
+
+    num = to_int8_if_bool(numerator(a))
+    den = to_int8_if_bool(denominator(a))
+
+    # Handle special cases
+    num_is_zero = iszero(num)
+    den_is_zero = iszero(den)
+    if den_is_zero
+        num_is_zero && return to_float(NaN)
+        return to_float(s * Inf)
+    end
+    num_is_zero && return to_float(false)
+
+    components = rational_to_float_components(
+      num,
+      den,
+      requested_precision,
+      T,
+      RoundNearest,
+    )
+    mantissa = to_float(components.mantissa)
+
+    # TODO: `ldexp` could be replaced with a mere bit of bit twiddling
+    # in the case of `Float16`, `Float32`, `Float64`
+    ret = ldexp(s * mantissa, components.exponent)
+
+    # TODO: faster?
+    if iszero(ret) | issubnormal(ret)
+        # "Rounding to odd" to prevent double rounding error, see
+        # https://hal-ens-lyon.archives-ouvertes.fr/inria-00080427v2
+        components = rational_to_float_components(
+          num,
+          den,
+          requested_precision,
+          T,
+          RoundToZero,
+        )
+        mantissa = to_float(components.mantissa | !components.is_exact)
+
+        # TODO: `ldexp` could be replaced with a mere bit of bit
+        # twiddling in the case of `Float16`, `Float32`, `Float64`
+        ret = ldexp(s * mantissa, components.exponent)
+    end
+
+    ret
+end
+
+rational_to_float_promote_type(
+    ::Type{F},
+    ::Type{S},
+) where {F<:AbstractFloat,S<:Integer} =
+    BigInt
+
+rational_to_float_promote_type(
+    ::Type{F},
+    ::Type{S},
+) where {F<:AbstractFloat,S<:Unsigned} =
+    rational_to_float_promote_type(F, signed(S))
+
+# As an optimization, use a narrower type when possible.
+rational_to_float_promote_type(::Type{Float16},                          ::Type{<:Union{Int8,Int16}}) = Int32
+rational_to_float_promote_type(::Type{Float32},                          ::Type{<:Union{Int8,Int16}}) = Int64
+rational_to_float_promote_type(::Type{Float64},                          ::Type{<:Union{Int8,Int16}}) = Int128
+rational_to_float_promote_type(::Type{<:Union{Float16,Float32}},         ::Type{Int32})               = Int64
+rational_to_float_promote_type(::Type{Float64},                          ::Type{Int32})               = Int128
+rational_to_float_promote_type(::Type{<:Union{Float16,Float32,Float64}}, ::Type{Int64})               = Int128
+
+(::Type{F})(x::Rational) where {F<:AbstractFloat} =
+    rational_to_float_impl(
+        F,
+        rational_to_float_promote_type(
+            F,
+            promote_type(
+                typeof(numerator(x)),
+                typeof(denominator(x)),
+            ),
+        ),
+        x,
+        precision(F),
+    )::F
+
+AbstractFloat(x::Q) where {Q<:Rational} =
+    float(Q)(x)::AbstractFloat
+
 function Rational{T}(x::AbstractFloat) where T<:Integer
     r = rationalize(T, x, tol=0)
     x == convert(typeof(x), r) || throw(InexactError(:Rational, Rational{T}, x))
diff --git a/test/rational.jl b/test/rational.jl