|
26 | 26 | namespace LIBC_NAMESPACE_DECL { |
27 | 27 | namespace fputil { |
28 | 28 |
|
| 29 | +// Decide whether to round up a UInt at a given bit position, based on |
| 30 | +// the current rounding mode. The assumption is that the caller is |
| 31 | +// going to make the integer `value >> rshift`, and then might need to |
| 32 | +// round it up by 1 depending on the value of the bits shifted off the |
| 33 | +// bottom. |
| 34 | +// |
| 35 | +// `logical_sign` causes the behavior of FE_DOWNWARD and FE_UPWARD to |
| 36 | +// be reversed, which is what you'd want if this is the mantissa of a |
| 37 | +// negative floating-point number. |
| 38 | +template <size_t Bits> |
| 39 | +LIBC_INLINE constexpr bool |
| 40 | +need_to_round_up(const LIBC_NAMESPACE::UInt<Bits> &value, size_t rshift, |
| 41 | + Sign logical_sign) { |
| 42 | + switch (quick_get_round()) { |
| 43 | + case FE_TONEAREST: |
| 44 | + if (rshift > 0 && rshift <= Bits && value.get_bit(rshift - 1)) { |
| 45 | + // We round up, unless the value is an exact halfway case and |
| 46 | + // the bit that will end up in the units place is 0, in which |
| 47 | + // case tie-break-to-even says round down. |
| 48 | + return value.get_bit(rshift) != 0 || (value << (Bits - rshift + 1)) != 0; |
| 49 | + } else { |
| 50 | + return false; |
| 51 | + } |
| 52 | + case FE_TOWARDZERO: |
| 53 | + return false; |
| 54 | + case FE_DOWNWARD: |
| 55 | + return logical_sign.is_neg() && (value << (Bits - rshift)) != 0; |
| 56 | + case FE_UPWARD: |
| 57 | + return logical_sign.is_pos() && (value << (Bits - rshift)) != 0; |
| 58 | + default: |
| 59 | + __builtin_unreachable(); |
| 60 | + } |
| 61 | +} |
| 62 | + |
29 | 63 | // A generic class to perform computations of high precision floating points. |
30 | 64 | // We store the value in dyadic format, including 3 fields: |
31 | 65 | // sign : boolean value - false means positive, true means negative |
@@ -101,6 +135,27 @@ template <size_t Bits> struct DyadicFloat { |
101 | 135 | return exponent + (Bits - 1); |
102 | 136 | } |
103 | 137 |
|
| 138 | + // Produce a correctly rounded DyadicFloat from a too-large mantissa, |
| 139 | + // by shifting it down and rounding if necessary. |
| 140 | + template <size_t MantissaBits> |
| 141 | + LIBC_INLINE constexpr static DyadicFloat<Bits> |
| 142 | + round(Sign result_sign, int result_exponent, |
| 143 | + const LIBC_NAMESPACE::UInt<MantissaBits> &input_mantissa, |
| 144 | + size_t rshift) { |
| 145 | + MantissaType result_mantissa(input_mantissa >> rshift); |
| 146 | + if (need_to_round_up(input_mantissa, rshift, result_sign)) { |
| 147 | + ++result_mantissa; |
| 148 | + if (result_mantissa == 0) { |
| 149 | + // Rounding up made the mantissa integer wrap round to 0, |
| 150 | + // carrying a bit off the top. So we've rounded up to the next |
| 151 | + // exponent. |
| 152 | + result_mantissa.set_bit(Bits - 1); |
| 153 | + ++result_exponent; |
| 154 | + } |
| 155 | + } |
| 156 | + return DyadicFloat(result_sign, result_exponent, result_mantissa); |
| 157 | + } |
| 158 | + |
104 | 159 | #ifdef LIBC_TYPES_HAS_FLOAT16 |
105 | 160 | template <typename T, bool ShouldSignalExceptions> |
106 | 161 | LIBC_INLINE constexpr cpp::enable_if_t< |
@@ -374,6 +429,34 @@ template <size_t Bits> struct DyadicFloat { |
374 | 429 |
|
375 | 430 | return new_mant; |
376 | 431 | } |
| 432 | + |
| 433 | + LIBC_INLINE constexpr MantissaType |
| 434 | + as_mantissa_type_rounded(bool *overflowed = nullptr) const { |
| 435 | + if (mantissa.is_zero()) |
| 436 | + return 0; |
| 437 | + |
| 438 | + MantissaType new_mant = mantissa; |
| 439 | + if (exponent > 0) { |
| 440 | + new_mant <<= exponent; |
| 441 | + if (overflowed) |
| 442 | + *overflowed = (new_mant >> exponent) != mantissa; |
| 443 | + } else if (exponent < 0) { |
| 444 | + size_t shift = -exponent; |
| 445 | + new_mant >>= shift; |
| 446 | + if (need_to_round_up(mantissa, shift, sign)) |
| 447 | + ++new_mant; |
| 448 | + } |
| 449 | + |
| 450 | + if (sign.is_neg()) { |
| 451 | + new_mant = (~new_mant) + 1; |
| 452 | + } |
| 453 | + |
| 454 | + return new_mant; |
| 455 | + } |
| 456 | + |
| 457 | + LIBC_INLINE constexpr DyadicFloat operator-() const { |
| 458 | + return DyadicFloat(sign.negate(), exponent, mantissa); |
| 459 | + } |
377 | 460 | }; |
378 | 461 |
|
379 | 462 | // Quick add - Add 2 dyadic floats with rounding toward 0 and then normalize the |
@@ -433,6 +516,12 @@ LIBC_INLINE constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a, |
433 | 516 | return result.normalize(); |
434 | 517 | } |
435 | 518 |
|
| 519 | +template <size_t Bits> |
| 520 | +LIBC_INLINE constexpr DyadicFloat<Bits> quick_sub(DyadicFloat<Bits> a, |
| 521 | + DyadicFloat<Bits> b) { |
| 522 | + return quick_add(a, -b); |
| 523 | +} |
| 524 | + |
436 | 525 | // Quick Mul - Slightly less accurate but efficient multiplication of 2 dyadic |
437 | 526 | // floats with rounding toward 0 and then normalize the output: |
438 | 527 | // result.exponent = a.exponent + b.exponent + Bits, |
@@ -464,6 +553,96 @@ LIBC_INLINE constexpr DyadicFloat<Bits> quick_mul(const DyadicFloat<Bits> &a, |
464 | 553 | return result; |
465 | 554 | } |
466 | 555 |
|
| 556 | +// Correctly rounded multiplication of 2 dyadic floats, assuming the |
| 557 | +// exponent remains within range. |
| 558 | +template <size_t Bits> |
| 559 | +LIBC_INLINE constexpr DyadicFloat<Bits> |
| 560 | +rounded_mul(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b) { |
| 561 | + using DblMant = LIBC_NAMESPACE::UInt<(2 * Bits)>; |
| 562 | + Sign result_sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS; |
| 563 | + int result_exponent = a.exponent + b.exponent + static_cast<int>(Bits); |
| 564 | + auto product = DblMant(a.mantissa) * DblMant(b.mantissa); |
| 565 | + // As in quick_mul(), renormalize by 1 bit manually rather than countl_zero |
| 566 | + if (product.get_bit(2 * Bits - 1) == 0) { |
| 567 | + product <<= 1; |
| 568 | + result_exponent -= 1; |
| 569 | + } |
| 570 | + |
| 571 | + return DyadicFloat<Bits>::round(result_sign, result_exponent, product, Bits); |
| 572 | +} |
| 573 | + |
| 574 | +// Approximate reciprocal - given a nonzero a, make a good approximation to 1/a. |
| 575 | +// The method is Newton-Raphson iteration, based on quick_mul. |
| 576 | +template <size_t Bits, typename = cpp::enable_if_t<(Bits >= 32)>> |
| 577 | +LIBC_INLINE constexpr DyadicFloat<Bits> |
| 578 | +approx_reciprocal(const DyadicFloat<Bits> &a) { |
| 579 | + // Given an approximation x to 1/a, a better one is x' = x(2-ax). |
| 580 | + // |
| 581 | + // You can derive this by using the Newton-Raphson formula with the function |
| 582 | + // f(x) = 1/x - a. But another way to see that it works is to say: suppose |
| 583 | + // that ax = 1-e for some small error e. Then ax' = ax(2-ax) = (1-e)(1+e) = |
| 584 | + // 1-e^2. So the error in x' is the square of the error in x, i.e. the number |
| 585 | + // of correct bits in x' is double the number in x. |
| 586 | + |
| 587 | + // An initial approximation to the reciprocal |
| 588 | + DyadicFloat<Bits> x(Sign::POS, -32 - a.exponent - Bits, |
| 589 | + uint64_t(0xFFFFFFFFFFFFFFFF) / |
| 590 | + static_cast<uint64_t>(a.mantissa >> (Bits - 32))); |
| 591 | + |
| 592 | + // The constant 2, which we'll need in every iteration |
| 593 | + DyadicFloat<Bits> two(Sign::POS, 1, 1); |
| 594 | + |
| 595 | + // We expect at least 31 correct bits from our 32-bit starting approximation |
| 596 | + size_t ok_bits = 31; |
| 597 | + |
| 598 | + // The number of good bits doubles in each iteration, except that rounding |
| 599 | + // errors introduce a little extra each time. Subtract a bit from our |
| 600 | + // accuracy assessment to account for that. |
| 601 | + while (ok_bits < Bits) { |
| 602 | + x = quick_mul(x, quick_sub(two, quick_mul(a, x))); |
| 603 | + ok_bits = 2 * ok_bits - 1; |
| 604 | + } |
| 605 | + |
| 606 | + return x; |
| 607 | +} |
| 608 | + |
| 609 | +// Correctly rounded division of 2 dyadic floats, assuming the |
| 610 | +// exponent remains within range. |
| 611 | +template <size_t Bits> |
| 612 | +LIBC_INLINE constexpr DyadicFloat<Bits> |
| 613 | +rounded_div(const DyadicFloat<Bits> &af, const DyadicFloat<Bits> &bf) { |
| 614 | + using DblMant = LIBC_NAMESPACE::UInt<(Bits * 2 + 64)>; |
| 615 | + |
| 616 | + // Make an approximation to the quotient as a * (1/b). Both the |
| 617 | + // multiplication and the reciprocal are a bit sloppy, which doesn't |
| 618 | + // matter, because we're going to correct for that below. |
| 619 | + auto qf = fputil::quick_mul(af, fputil::approx_reciprocal(bf)); |
| 620 | + |
| 621 | + // Switch to BigInt and stop using quick_add and quick_mul: now |
| 622 | + // we're working in exact integers so as to get the true remainder. |
| 623 | + DblMant a = af.mantissa, b = bf.mantissa, q = qf.mantissa; |
| 624 | + q <<= 2; // leave room for a round bit, even if exponent decreases |
| 625 | + a <<= af.exponent - bf.exponent - qf.exponent + 2; |
| 626 | + DblMant qb = q * b; |
| 627 | + if (qb < a) { |
| 628 | + DblMant too_small = a - b; |
| 629 | + while (qb <= too_small) { |
| 630 | + qb += b; |
| 631 | + ++q; |
| 632 | + } |
| 633 | + } else { |
| 634 | + while (qb > a) { |
| 635 | + qb -= b; |
| 636 | + --q; |
| 637 | + } |
| 638 | + } |
| 639 | + |
| 640 | + DyadicFloat<(Bits * 2)> qbig(qf.sign, qf.exponent - 2, q); |
| 641 | + auto qfinal = DyadicFloat<Bits>::round(qbig.sign, qbig.exponent + Bits, |
| 642 | + qbig.mantissa, Bits); |
| 643 | + return qfinal; |
| 644 | +} |
| 645 | + |
467 | 646 | // Simple polynomial approximation. |
468 | 647 | template <size_t Bits> |
469 | 648 | LIBC_INLINE constexpr DyadicFloat<Bits> |
|
0 commit comments