[libc] Alternative algorithm for decimal FP printf #123643
@@ -26,6 +26,49 @@
namespace LIBC_NAMESPACE_DECL {
namespace fputil {

// Decide whether to round a UInt up, down or not at all at a given bit
// position, based on the current rounding mode. The assumption is that the
// caller is going to make the integer `value >> rshift`, and then might need
// to round it up by 1 depending on the value of the bits shifted off the
// bottom.
//
// `logical_sign` causes the behavior of FE_DOWNWARD and FE_UPWARD to
// be reversed, which is what you'd want if this is the mantissa of a
// negative floating-point number.
//
// Return value is +1 if the value should be rounded up; -1 if it should be
// rounded down; 0 if it's exact and needs no rounding.
template <size_t Bits>
LIBC_INLINE constexpr int
rounding_direction(const LIBC_NAMESPACE::UInt<Bits> &value, size_t rshift,
                   Sign logical_sign) {
  if (rshift == 0 || (rshift < Bits && (value << (Bits - rshift)) == 0) ||
      (rshift >= Bits && value == 0))
    return 0; // exact

  switch (quick_get_round()) {
  case FE_TONEAREST:
    if (rshift > 0 && rshift <= Bits && value.get_bit(rshift - 1)) {
      // We round up, unless the value is an exact halfway case and
      // the bit that will end up in the units place is 0, in which
      // case tie-break-to-even says round down.
      return value.get_bit(rshift) != 0 || (value << (Bits - rshift + 1)) != 0
                 ? +1
                 : -1;
    } else {
      return -1;
    }
  case FE_TOWARDZERO:
    return -1;
  case FE_DOWNWARD:
    return logical_sign.is_neg() && (value << (Bits - rshift)) != 0 ? +1 : -1;
  case FE_UPWARD:
    return logical_sign.is_pos() && (value << (Bits - rshift)) != 0 ? +1 : -1;
  default:
    __builtin_unreachable();
  }
}
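To make the decision table above concrete, here is a standalone sketch of the same idea using a plain `uint64_t` and the standard `<cfenv>` rounding-mode query, instead of the libc-internal `UInt` and `Sign` types. `rounding_direction_u64` and its `negative` flag are names invented for this example, and it only handles shifts strictly between 0 and 64.

```cpp
#include <cfenv>
#include <cstdint>
#include <cstdio>

// Decide how `value >> rshift` should be rounded under the current rounding
// mode, for 0 < rshift < 64. Returns +1 (round up), -1 (round down) or 0
// (the shifted-off bits are all zero, so the result is exact).
static int rounding_direction_u64(uint64_t value, unsigned rshift,
                                  bool negative) {
  uint64_t dropped = value & ((uint64_t{1} << rshift) - 1);
  if (dropped == 0)
    return 0; // exact
  uint64_t half = uint64_t{1} << (rshift - 1);
  switch (std::fegetround()) {
  case FE_TONEAREST:
    if (dropped != half)
      return dropped > half ? +1 : -1;
    // Exact halfway case: tie-break to even on the future units bit.
    return ((value >> rshift) & 1) ? +1 : -1;
  case FE_TOWARDZERO:
    return -1;
  case FE_DOWNWARD:
    return negative ? +1 : -1; // toward -inf grows a negative magnitude
  case FE_UPWARD:
    return negative ? -1 : +1; // toward +inf grows a positive magnitude
  default:
    return -1;
  }
}

int main() {
  // 0b1011 >> 2 drops the bits 0b11, which is more than half of 0b100,
  // so round-to-nearest (the default mode) says round up: prints 1.
  std::printf("%d\n", rounding_direction_u64(0b1011, 2, false));
}
```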

// A generic class to perform computations of high precision floating points.
// We store the value in dyadic format, including 3 fields:
// sign : boolean value - false means positive, true means negative

@@ -101,6 +144,27 @@ template <size_t Bits> struct DyadicFloat {
    return exponent + (Bits - 1);
  }

  // Produce a correctly rounded DyadicFloat from a too-large mantissa,
  // by shifting it down and rounding if necessary.
  template <size_t MantissaBits>
  LIBC_INLINE constexpr static DyadicFloat<Bits>
  round(Sign result_sign, int result_exponent,
        const LIBC_NAMESPACE::UInt<MantissaBits> &input_mantissa,
        size_t rshift) {
    MantissaType result_mantissa(input_mantissa >> rshift);
    if (rounding_direction(input_mantissa, rshift, result_sign) > 0) {
      ++result_mantissa;
      if (result_mantissa == 0) {
        // Rounding up made the mantissa integer wrap round to 0,
        // carrying a bit off the top. So we've rounded up to the next
        // exponent.
        result_mantissa.set_bit(Bits - 1);
        ++result_exponent;
      }
    }
    return DyadicFloat(result_sign, result_exponent, result_mantissa);
  }
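As a rough illustration of the carry-out branch above, here is a hypothetical 8-bit analogue using plain integers (none of this is the patch's code): rounding an all-ones wide mantissa up makes the narrow mantissa wrap to zero, so the leading bit is re-set and the exponent absorbs the carry.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint16_t wide = 0xFF80; // wide mantissa; the dropped low byte 0x80 is a tie
  int exponent = 0;
  uint8_t mant = wide >> 8; // 0xFF
  // Round to nearest, ties to even on the units bit of `mant`.
  bool round_up =
      (wide & 0xFF) > 0x80 || ((wide & 0xFF) == 0x80 && (mant & 1) != 0);
  if (round_up && ++mant == 0) {
    // The increment carried out of the 8-bit mantissa: re-set the leading
    // bit and bump the exponent, as round() does with set_bit(Bits - 1).
    mant = 0x80;
    ++exponent;
  }
  // Prints mant=0x80 exponent=1, i.e. 0xFF80 * 2^-8 rounds up to 0x80 * 2^1.
  std::printf("mant=0x%02X exponent=%d\n", (unsigned)mant, exponent);
}
```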

#ifdef LIBC_TYPES_HAS_FLOAT16
  template <typename T, bool ShouldSignalExceptions>
  LIBC_INLINE constexpr cpp::enable_if_t<

@@ -374,6 +438,39 @@ template <size_t Bits> struct DyadicFloat {
    return new_mant;
  }

  LIBC_INLINE constexpr MantissaType
  as_mantissa_type_rounded(int *round_dir_out = nullptr) const {
    int round_dir = 0;
    MantissaType new_mant;
    if (mantissa.is_zero()) {
      new_mant = 0;
    } else {
      new_mant = mantissa;
      if (exponent > 0) {
        new_mant <<= exponent;
      } else if (exponent < 0) {
        size_t shift = -exponent;
        new_mant >>= shift;
        round_dir = rounding_direction(mantissa, shift, sign);
        if (round_dir > 0)
          ++new_mant;
      }

      if (sign.is_neg()) {
        new_mant = (~new_mant) + 1;
      }
    }

    if (round_dir_out)
      *round_dir_out = round_dir;

    return new_mant;
  }

  LIBC_INLINE constexpr DyadicFloat operator-() const {
    return DyadicFloat(sign.negate(), exponent, mantissa);
  }
};

// Quick add - Add 2 dyadic floats with rounding toward 0 and then normalize the

@@ -433,6 +530,12 @@ LIBC_INLINE constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a,
  return result.normalize();
}

template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits> quick_sub(DyadicFloat<Bits> a,
                                                  DyadicFloat<Bits> b) {
  return quick_add(a, -b);
}

// Quick Mul - Slightly less accurate but efficient multiplication of 2 dyadic
// floats with rounding toward 0 and then normalize the output:
//   result.exponent = a.exponent + b.exponent + Bits,

@@ -464,6 +567,96 @@ LIBC_INLINE constexpr DyadicFloat<Bits> quick_mul(const DyadicFloat<Bits> &a,
  return result;
}

// Correctly rounded multiplication of 2 dyadic floats, assuming the
// exponent remains within range.
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits>
rounded_mul(const DyadicFloat<Bits> &a, const DyadicFloat<Bits> &b) {
  using DblMant = LIBC_NAMESPACE::UInt<(2 * Bits)>;
  Sign result_sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS;
  int result_exponent = a.exponent + b.exponent + static_cast<int>(Bits);
  auto product = DblMant(a.mantissa) * DblMant(b.mantissa);
  // As in quick_mul(), renormalize by 1 bit manually rather than countl_zero
  if (product.get_bit(2 * Bits - 1) == 0) {
    product <<= 1;
    result_exponent -= 1;
  }

  return DyadicFloat<Bits>::round(result_sign, result_exponent, product, Bits);
}
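As a tiny numeric illustration of the one-bit renormalization (an 8-bit stand-in for `Bits`, not the patch itself): the product of two mantissas with their top bit set occupies either `2 * Bits` or `2 * Bits - 1` bits, so at most one left shift is ever needed.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t a = 0x80, b = 0xC0;         // both normalized: top bit set
  uint16_t product = uint16_t(a) * b; // 0x6000: bit 15 is clear
  int exponent_adjust = 0;
  if ((product & 0x8000) == 0) {      // renormalize by exactly one bit
    product <<= 1;                    // 0xC000
    --exponent_adjust;
  }
  // Prints product=0xC000 adjust=-1
  std::printf("product=0x%04X adjust=%d\n", (unsigned)product,
              exponent_adjust);
}
```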

// Approximate reciprocal - given a nonzero a, make a good approximation to 1/a.
// The method is Newton-Raphson iteration, based on quick_mul.
template <size_t Bits, typename = cpp::enable_if_t<(Bits >= 32)>>
LIBC_INLINE constexpr DyadicFloat<Bits>
approx_reciprocal(const DyadicFloat<Bits> &a) {
  // Given an approximation x to 1/a, a better one is x' = x(2-ax).
  //
  // You can derive this by using the Newton-Raphson formula with the function
  // f(x) = 1/x - a. But another way to see that it works is to say: suppose
  // that ax = 1-e for some small error e. Then ax' = ax(2-ax) = (1-e)(1+e) =
  // 1-e^2. So the error in x' is the square of the error in x, i.e. the number
  // of correct bits in x' is double the number in x.

  // An initial approximation to the reciprocal
  DyadicFloat<Bits> x(Sign::POS, -32 - a.exponent - Bits,
                      uint64_t(0xFFFFFFFFFFFFFFFF) /
                          static_cast<uint64_t>(a.mantissa >> (Bits - 32)));

  // The constant 2, which we'll need in every iteration
  DyadicFloat<Bits> two(Sign::POS, 1, 1);

  // We expect at least 31 correct bits from our 32-bit starting approximation
  size_t ok_bits = 31;

  // The number of good bits doubles in each iteration, except that rounding
  // errors introduce a little extra each time. Subtract a bit from our
  // accuracy assessment to account for that.
  while (ok_bits < Bits) {
    x = quick_mul(x, quick_sub(two, quick_mul(a, x)));
    ok_bits = 2 * ok_bits - 1;
  }

  return x;
}
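A quick numerical demonstration of the `x' = x(2 - ax)` iteration, using plain `double` in place of `DyadicFloat` (the values are arbitrary): the relative error is roughly squared at each step, i.e. the number of correct bits roughly doubles until it reaches the precision of the type.

```cpp
#include <cmath>
#include <cstdio>

int main() {
  double a = 3.7;
  double x = 0.25; // crude starting guess for 1/3.7 ~= 0.2702702...
  for (int step = 1; step <= 5; ++step) {
    x = x * (2.0 - a * x); // Newton-Raphson refinement
    std::printf("step %d: x = %.17g, |error| = %.3g\n", step, x,
                std::fabs(x - 1.0 / a));
  }
}
```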

// Correctly rounded division of 2 dyadic floats, assuming the
// exponent remains within range.
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits>
rounded_div(const DyadicFloat<Bits> &af, const DyadicFloat<Bits> &bf) {
  using DblMant = LIBC_NAMESPACE::UInt<(Bits * 2 + 64)>;

  // Make an approximation to the quotient as a * (1/b). Both the
  // multiplication and the reciprocal are a bit sloppy, which doesn't
  // matter, because we're going to correct for that below.
  auto qf = fputil::quick_mul(af, fputil::approx_reciprocal(bf));

  // Switch to BigInt and stop using quick_add and quick_mul: now
  // we're working in exact integers so as to get the true remainder.
  DblMant a = af.mantissa, b = bf.mantissa, q = qf.mantissa;
  q <<= 2; // leave room for a round bit, even if exponent decreases
  a <<= af.exponent - bf.exponent - qf.exponent + 2;
  DblMant qb = q * b;
  if (qb < a) {
    DblMant too_small = a - b;
    while (qb <= too_small) {
      qb += b;
      ++q;
    }
  } else {
    while (qb > a) {
      qb -= b;
      --q;
    }
  }

  DyadicFloat<(Bits * 2)> qbig(qf.sign, qf.exponent - 2, q);
  auto qfinal = DyadicFloat<Bits>::round(qbig.sign, qbig.exponent + Bits,
                                         qbig.mantissa, Bits);
  return qfinal;
}
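Here is a hypothetical integer-only analogue of the correction step, with arbitrary small numbers standing in for the scaled mantissas: starting from a slightly-off quotient estimate, the two loops nudge `q` until `q * b <= a < (q + 1) * b`.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t a = 1000003, b = 37;
  uint64_t q = 27100;           // slightly-too-large approximation to a / b
  uint64_t qb = q * b;
  if (qb < a) {
    uint64_t too_small = a - b; // while (q + 1) * b <= a, q is too small
    while (qb <= too_small) {
      qb += b;
      ++q;
    }
  } else {
    while (qb > a) {            // while q * b > a, q is too large
      qb -= b;
      --q;
    }
  }
  // Prints q = 27027 exact = 27027
  std::printf("q = %llu exact = %llu\n", (unsigned long long)q,
              (unsigned long long)(a / b));
}
```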

// Simple polynomial approximation.
template <size_t Bits>
LIBC_INLINE constexpr DyadicFloat<Bits>
It looks like `value.get_bit(rshift)` will overflow if `rshift == Bits`?