-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[libc][math] Improve performance of double precision trig functions. #111793
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-libc Author: None (lntue) Changes
Patch is 85.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111793.diff 13 Files Affected:
diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h
index 25a4ee03387c67..b51d3c4c63cf55 100644
--- a/libc/src/__support/FPUtil/double_double.h
+++ b/libc/src/__support/FPUtil/double_double.h
@@ -73,6 +73,26 @@ template <size_t N = 27> LIBC_INLINE constexpr DoubleDouble split(double a) {
return r;
}
+// Helper for non-fma exact mult where the first number is already splitted.
+template <bool NO_FMA_ALL_ROUNDINGS = false>
+LIBC_INLINE DoubleDouble exact_mult(const DoubleDouble &as, double a,
+ double b) {
+ DoubleDouble bs, r;
+
+ if constexpr (NO_FMA_ALL_ROUNDINGS)
+ bs = split<28>(b);
+ else
+ bs = split(b);
+
+ r.hi = a * b;
+ double t1 = as.hi * bs.hi - r.hi;
+ double t2 = as.hi * bs.lo + t1;
+ double t3 = as.lo * bs.hi + t2;
+ r.lo = as.lo * bs.lo + t3;
+
+ return r;
+}
+
// Note: When FMA instruction is not available, the `exact_mult` function is
// only correct for round-to-nearest mode. See:
// Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
@@ -90,18 +110,8 @@ LIBC_INLINE DoubleDouble exact_mult(double a, double b) {
#else
// Dekker's Product.
DoubleDouble as = split(a);
- DoubleDouble bs;
- if constexpr (NO_FMA_ALL_ROUNDINGS)
- bs = split<28>(b);
- else
- bs = split(b);
-
- r.hi = a * b;
- double t1 = as.hi * bs.hi - r.hi;
- double t2 = as.hi * bs.lo + t1;
- double t3 = as.lo * bs.hi + t2;
- r.lo = as.lo * bs.lo + t3;
+ r = exact_mult<NO_FMA_ALL_ROUNDINGS>(as, a, b);
#endif // LIBC_TARGET_CPU_HAS_FMA
return r;
@@ -157,8 +167,8 @@ LIBC_INLINE DoubleDouble div(const DoubleDouble &a, const DoubleDouble &b) {
double e_hi = fputil::multiply_add(b.hi, -r.hi, a.hi);
double e_lo = fputil::multiply_add(b.lo, -r.hi, a.lo);
#else
- DoubleDouble b_hi_r_hi = fputil::exact_mult</*NO_FMA=*/true>(b.hi, -r.hi);
- DoubleDouble b_lo_r_hi = fputil::exact_mult</*NO_FMA=*/true>(b.lo, -r.hi);
+ DoubleDouble b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi);
+ DoubleDouble b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi);
double e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo;
double e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo;
#endif // LIBC_TARGET_CPU_HAS_FMA
diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h
index 5ffd474d35c54d..ce99f1ce6e5323 100644
--- a/libc/src/__support/macros/optimization.h
+++ b/libc/src/__support/macros/optimization.h
@@ -48,6 +48,16 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) {
#ifndef LIBC_MATH
#define LIBC_MATH 0
+#else
+
+#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0)
+#define LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+#endif
+
+#if ((LIBC_MATH & LIBC_MATH_SMALL_TABLES) != 0)
+#define LIBC_MATH_HAS_SMALL_TABLES
+#endif
+
#endif // LIBC_MATH
#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H
diff --git a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp
index e61d800ce2dada..fa3c489aa15ecb 100644
--- a/libc/src/math/generic/cos.cpp
+++ b/libc/src/math/generic/cos.cpp
@@ -17,17 +17,14 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
+#include "src/math/generic/range_reduction_double_common.h"
#include "src/math/generic/sincos_eval.h"
-// TODO: We might be able to improve the performance of large range reduction of
-// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and
-// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of
-// those lookup table.
-#include "range_reduction_double_common.h"
-
-#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0)
-#define LIBC_MATH_COS_SKIP_ACCURATE_PASS
-#endif
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+#include "range_reduction_double_fma.h"
+#else
+#include "range_reduction_double_nofma.h"
+#endif // LIBC_TARGET_CPU_HAS_FMA
namespace LIBC_NAMESPACE_DECL {
@@ -42,22 +39,29 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
DoubleDouble y;
unsigned k;
- generic::LargeRangeReduction<NO_FMA> range_reduction_large{};
+ LargeRangeReduction range_reduction_large{};
- // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA)
+ // |x| < 2^16.
if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) {
- // |x| < 2^-27
- if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) {
- // Signed zeros.
- if (LIBC_UNLIKELY(x == 0.0))
- return 1.0;
-
- // For |x| < 2^-27, |cos(x) - 1| < |x|^2/2 < 2^-54 = ulp(1 - 2^-53)/2.
- return fputil::round_result_slightly_down(1.0);
+ // |x| < 2^-7
+ if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) {
+ // |x| < 2^-27
+ if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) {
+ // Signed zeros.
+ if (LIBC_UNLIKELY(x == 0.0))
+ return 1.0;
+
+ // For |x| < 2^-27, |cos(x) - 1| < |x|^2/2 < 2^-54 = ulp(1 - 2^-53)/2.
+ return fputil::round_result_slightly_down(1.0);
+ }
+ // No range reduction needed.
+ k = 0;
+ y.lo = 0.0;
+ y.hi = x;
+ } else {
+ // Small range reduction.
+ k = range_reduction_small(x, y);
}
-
- // // Small range reduction.
- k = range_reduction_small(x, y);
} else {
// Inf or NaN
if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) {
@@ -70,45 +74,34 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
}
// Large range reduction.
- k = range_reduction_large.compute_high_part(x);
- y = range_reduction_large.fast();
+ k = range_reduction_large.fast(x, y);
}
DoubleDouble sin_y, cos_y;
- generic::sincos_eval(y, sin_y, cos_y);
+ [[maybe_unused]] double err = generic::sincos_eval(y, sin_y, cos_y);
// Look up sin(k * pi/128) and cos(k * pi/128)
- // Memory saving versions:
-
- // Use 128-entry table instead:
- // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127];
- // uint64_t sin_s = static_cast<uint64_t>((k + 128) & 128) << (63 - 7);
- // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val();
- // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val();
- // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127];
- // uint64_t cos_s = static_cast<uint64_t>((k + 64) & 128) << (63 - 7);
- // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val();
- // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val();
-
- // Use 64-entry table instead:
- // auto get_idx_dd = [](unsigned kk) -> DoubleDouble {
- // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
- // DoubleDouble ans = SIN_K_PI_OVER_128[idx];
- // if (kk & 128) {
- // ans.hi = -ans.hi;
- // ans.lo = -ans.lo;
- // }
- // return ans;
- // };
- // DoubleDouble sin_k = get_idx_dd(k + 128);
- // DoubleDouble cos_k = get_idx_dd(k + 64);
-
+#ifdef LIBC_MATH_HAS_SMALL_TABLES
+ // Memory saving versions. Use 65-entry table.
+ auto get_idx_dd = [](unsigned kk) -> DoubleDouble {
+ unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
+ DoubleDouble ans = SIN_K_PI_OVER_128[idx];
+ if (kk & 128) {
+ ans.hi = -ans.hi;
+ ans.lo = -ans.lo;
+ }
+ return ans;
+ };
+ DoubleDouble sin_k = get_idx_dd(k + 128);
+ DoubleDouble cos_k = get_idx_dd(k + 64);
+#else
// Fast look up version, but needs 256-entry table.
// -sin(k * pi/128) = sin((k + 128) * pi/128)
// cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128).
DoubleDouble msin_k = SIN_K_PI_OVER_128[(k + 128) & 255];
DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255];
+#endif // LIBC_MATH_HAS_SMALL_TABLES
// After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128).
// So k is an integer and -pi / 256 <= y <= pi / 256.
@@ -120,20 +113,12 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
DoubleDouble rr = fputil::exact_add<false>(cos_k_cos_y.hi, msin_k_sin_y.hi);
rr.lo += msin_k_sin_y.lo + cos_k_cos_y.lo;
-#ifdef LIBC_MATH_COS_SKIP_ACCURATE_PASS
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
return rr.hi + rr.lo;
#else
- // Accurate test and pass for correctly rounded implementation.
-#ifdef LIBC_TARGET_CPU_HAS_FMA
- constexpr double ERR = 0x1.0p-70;
-#else
- // TODO: Improve non-FMA fast pass accuracy.
- constexpr double ERR = 0x1.0p-66;
-#endif // LIBC_TARGET_CPU_HAS_FMA
-
- double rlp = rr.lo + ERR;
- double rlm = rr.lo - ERR;
+ double rlp = rr.lo + err;
+ double rlm = rr.lo - err;
double r_upper = rr.hi + rlp; // (rr.lo + ERR);
double r_lower = rr.hi + rlm; // (rr.lo - ERR);
@@ -144,7 +129,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
Float128 u_f128, sin_u, cos_u;
if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT))
- u_f128 = generic::range_reduction_small_f128(x);
+ u_f128 = range_reduction_small_f128(x);
else
u_f128 = range_reduction_large.accurate();
@@ -152,7 +137,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
auto get_sin_k = [](unsigned kk) -> Float128 {
unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
- Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx];
+ Float128 ans = SIN_K_PI_OVER_128_F128[idx];
if (kk & 128)
ans.sign = Sign::NEG;
return ans;
@@ -172,7 +157,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
// https://github.com/llvm/llvm-project/issues/96452.
return static_cast<double>(r);
-#endif // !LIBC_MATH_COS_SKIP_ACCURATE_PASS
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h
index 290b642be4c69f..9469ecebc55802 100644
--- a/libc/src/math/generic/range_reduction_double_common.h
+++ b/libc/src/math/generic/range_reduction_double_common.h
@@ -17,150 +17,269 @@
#include "src/__support/common.h"
#include "src/__support/integer_literals.h"
#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
-#ifdef LIBC_TARGET_CPU_HAS_FMA
-#include "range_reduction_double_fma.h"
-
-// With FMA, we limit the maxmimum exponent to be 2^16, so that the error bound
-// from the fma::range_reduction_small is bounded by 2^-88 instead of 2^-72.
-#define FAST_PASS_EXPONENT 16
-using LIBC_NAMESPACE::fma::ONE_TWENTY_EIGHT_OVER_PI;
-using LIBC_NAMESPACE::fma::range_reduction_small;
-using LIBC_NAMESPACE::fma::SIN_K_PI_OVER_128;
+namespace LIBC_NAMESPACE_DECL {
-LIBC_INLINE constexpr bool NO_FMA = false;
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+static constexpr bool NO_FMA = false;
#else
-#include "range_reduction_double_nofma.h"
+static constexpr bool NO_FMA = true;
+#endif // LIBC_TARGET_CPU_HAS_FMA
-using LIBC_NAMESPACE::nofma::FAST_PASS_EXPONENT;
-using LIBC_NAMESPACE::nofma::ONE_TWENTY_EIGHT_OVER_PI;
-using LIBC_NAMESPACE::nofma::range_reduction_small;
-using LIBC_NAMESPACE::nofma::SIN_K_PI_OVER_128;
+using LIBC_NAMESPACE::fputil::DoubleDouble;
+using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>;
-LIBC_INLINE constexpr bool NO_FMA = true;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#define FAST_PASS_EXPONENT 16
-namespace LIBC_NAMESPACE_DECL {
+// For 2^-7 < |x| < 2^16, return k and u such that:
+// k = round(x * 128/pi)
+// x mod pi/128 = x - k * pi/128 ~ u.hi + u.lo
+// Error bound:
+// |(x - k * pi/128) - (u_hi + u_lo)| <= max(ulp(ulp(u_hi)), 2^-119)
+// <= 2^-111.
+LIBC_INLINE unsigned range_reduction_small(double x, DoubleDouble &u) {
+ // Values of -pi/128 used for inputs with absolute value <= 2^16.
+ // The first 3 parts are generated with (53 - 21 = 32)-bit precision, so that
+ // the product k * MPI_OVER_128[i] is exact.
+ // Generated by Sollya with:
+ // > display = hexadecimal!;
+ // > a = round(pi/128, 32, RN);
+ // > b = round(pi/128 - a, 32, RN);
+ // > c = round(pi/128 - a - b, D, RN);
+ // > print(-a, ",", -b, ",", -c);
+ constexpr double MPI_OVER_128[3] = {-0x1.921fb544p-6, -0x1.0b4611a6p-40,
+ -0x1.3198a2e037073p-75};
+ constexpr double ONE_TWENTY_EIGHT_OVER_PI_D = 0x1.45f306dc9c883p5;
+ double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI_D;
+ double kd = fputil::nearest_integer(prod_hi);
-namespace generic {
+ // Let y = x - k * (pi/128)
+ // Then |y| < pi / 256
+ // With extra rounding errors, we can bound |y| < 1.6 * 2^-7.
+ double y_hi = fputil::multiply_add(kd, MPI_OVER_128[0], x); // Exact
+ // |u.hi| < 1.6*2^-7
+ u.hi = fputil::multiply_add(kd, MPI_OVER_128[1], y_hi);
+ double u0 = y_hi - u.hi; // Exact;
+ // |u.lo| <= max(ulp(u.hi), |kd * MPI_OVER_128[2]|)
+ double u1 = fputil::multiply_add(kd, MPI_OVER_128[1], u0); // Exact
+ u.lo = fputil::multiply_add(kd, MPI_OVER_128[2], u1);
+ // Error bound:
+ // |x - k * pi/128| - (u.hi + u.lo) <= ulp(u.lo)
+ // <= ulp(max(ulp(u.hi), kd*MPI_OVER_128[2]))
+ // <= 2^(-7 - 104) = 2^-111.
-using LIBC_NAMESPACE::fputil::DoubleDouble;
-using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>;
+ return static_cast<unsigned>(static_cast<int64_t>(kd));
+}
-LIBC_INLINE constexpr Float128 PI_OVER_128_F128 = {
- Sign::POS, -133, 0xc90f'daa2'2168'c234'c4c6'628b'80dc'1cd1_u128};
+// Digits of 2^(16*i) / pi, generated by Sollya with:
+// > procedure ulp(x, n) { return 2^(floor(log2(abs(x))) - n); };
+// > for i from 0 to 63 do {
+// if i < 3 then { pi_inv = 0.25 + 2^(16*(i - 3)) / pi; }
+// else { pi_inv = 2^(16*(i-3)) / pi; };
+// pn = nearestint(pi_inv);
+// pi_frac = pi_inv - pn;
+// a = round(pi_frac, 51, RN);
+// b = round(pi_frac - a, 51, RN);
+// c = round(pi_frac - a - b, 51, RN);
+// d = round(pi_frac - a - b - c, D, RN);
+// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},");
+// };
+//
+// Notice that for [0..2] the leading bit of 2^(16*(i - 3)) / pi is very small,
+// so we add 0.25 so that the conditions for the algorithms are still satisfied,
+// and one of those conditions guarantees that ulp(0.25 * x_reduced) >= 2, and
+// will safely be discarded.
-// Note: The look-up tables ONE_TWENTY_EIGHT_OVER_PI is selected to be either
-// from fma:: or nofma:: namespace.
+static constexpr double ONE_TWENTY_EIGHT_OVER_PI[64][4] = {
+ {0x1.0000000000014p5, 0x1.7cc1b727220a8p-49, 0x1.4fe13abe8fa9cp-101,
+ -0x1.911f924eb5336p-153},
+ {0x1.0000000145f3p5, 0x1.b727220a94fep-49, 0x1.3abe8fa9a6eep-101,
+ 0x1.b6c52b3278872p-155},
+ {0x1.000145f306dc8p5, 0x1.c882a53f84ebp-47, -0x1.70565911f925p-101,
+ 0x1.4acc9e21c821p-153},
+ {0x1.45f306dc9c884p5, -0x1.5ac07b1505c14p-47, -0x1.96447e493ad4cp-99,
+ -0x1.b0ef1bef806bap-152},
+ {-0x1.f246c6efab58p4, -0x1.ec5417056591p-49, -0x1.f924eb53361ep-101,
+ 0x1.c820ff28b1d5fp-153},
+ {0x1.391054a7f09d4p4, 0x1.f47d4d377036cp-48, 0x1.8a5664f10e41p-100,
+ 0x1.fe5163abdebbcp-154},
+ {0x1.529fc2757d1f4p2, 0x1.34ddc0db62958p-50, 0x1.93c439041fe5p-102,
+ 0x1.63abdebbc561bp-154},
+ {-0x1.ec5417056591p-1, -0x1.f924eb53361ep-53, 0x1.c820ff28b1d6p-105,
+ -0x1.0a21d4f246dc9p-157},
+ {-0x1.505c1596447e4p5, -0x1.275a99b0ef1cp-48, 0x1.07f9458eaf7bp-100,
+ -0x1.0ea79236e4717p-152},
+ {-0x1.596447e493ad4p1, -0x1.9b0ef1bef806cp-52, 0x1.63abdebbc561cp-106,
+ -0x1.1b7238b7b645ap-159},
+ {0x1.bb81b6c52b328p5, -0x1.de37df00d74e4p-49, 0x1.5ef5de2b0db94p-101,
+ -0x1.c8e2ded9169p-153},
+ {0x1.b6c52b3278874p5, -0x1.f7c035d38a844p-47, 0x1.778ac36e48dc8p-99,
+ -0x1.6f6c8b47fe6dbp-152},
+ {0x1.2b3278872084p5, -0x1.ae9c5421443a8p-50, -0x1.e48db91c5bdb4p-102,
+ 0x1.d2e006492eea1p-154},
+ {-0x1.8778df7c035d4p5, 0x1.d5ef5de2b0db8p-49, 0x1.2371d2126e97p-101,
+ 0x1.924bba8274648p-160},
+ {-0x1.bef806ba71508p4, -0x1.443a9e48db91cp-50, -0x1.6f6c8b47fe6dcp-104,
+ 0x1.77504e8c90e7fp-157},
+ {-0x1.ae9c5421443a8p-2, -0x1.e48db91c5bdb4p-54, 0x1.d2e006492eeap-106,
+ 0x1.3a32439fc3bd6p-159},
+ {-0x1.38a84288753c8p5, -0x1.1b7238b7b645cp-47, 0x1.c00c925dd413cp-99,
+ -0x1.cdbc603c429c7p-151},
+ {-0x1.0a21d4f246dc8p3, -0x1.c5bdb22d1ff9cp-50, 0x1.25dd413a32438p-103,
+ 0x1.fc3bd63962535p-155},
+ {-0x1.d4f246dc8e2ep3, 0x1.26e9700324978p-49, -0x1.5f62e6de301e4p-102,
+ 0x1.eb1cb129a73efp-154},
+ {-0x1.236e4716f6c8cp4, 0x1.700324977505p-49, -0x1.736f180f10a7p-101,
+ -0x1.a76b2c608bbeep-153},
+ {0x1.b8e909374b8p4, 0x1.924bba8274648p-48, 0x1.cfe1deb1cb128p-102,
+ 0x1.a73ee88235f53p-154},
+ {0x1.09374b801924cp4, -0x1.15f62e6de302p-50, 0x1.deb1cb129a74p-102,
+ -0x1.177dca0ad144cp-154},
+ {-0x1.68ffcdb688afcp3, 0x1.d1921cfe1debp-50, 0x1.cb129a73ee884p-102,
+ -0x1.ca0ad144bb7b1p-154},
+ {0x1.924bba8274648p0, 0x1.cfe1deb1cb128p-54, 0x1.a73ee88235f54p-106,
+ -0x1.144bb7b16639p-158},
+ {-0x1.a22bec5cdbc6p5, -0x1.e214e34ed658cp-50, -0x1.177dca0ad144cp-106,
+ 0x1.213a671c09ad1p-160},
+ {0x1.3a32439fc3bd8p1, -0x1.c69dacb1822fp-51, 0x1.1afa975da2428p-105,
+ -0x1.6638fd94ba082p-158},
+ {-0x1.b78c0788538d4p4, 0x1.29a73ee88236p-50, -0x1.5a28976f62cc8p-103,
+ 0x1.c09ad17df904ep-156},
+ {0x1.fc3bd63962534p5, 0x1.cfba208d7d4bcp-48, -0x1.12edec598e3f8p-100,
+ 0x1.ad17df904e647p-152},
+ {-0x1.4e34ed658c118p2, 0x1.046bea5d7689p-51, 0x1.3a671c09ad17cp-104,
+ 0x1.f904e64758e61p-156},
+ {0x1.62534e7dd1048p5, -0x1.415a28976f62cp-47, -0x1.8e3f652e8207p-100,
+ 0x1.3991d63983534p-154},
+ {-0x1.63045df7282b4p4, -0x1.44bb7b16638fcp-50, -0x1.94ba081bec67p-102,
+ 0x1.d639835339f4ap-154},
+ {0x1.d1046bea5d768p5, 0x1.213a671c09adp-48, 0x1.7df904e64759p-100,
+ -0x1.9f2b3182d8defp-152},
+ {0x1.afa975da24274p3, 0x1.9c7026b45f7e4p-50, 0x1.3991d63983534p-106,
+ -0x1.82d8dee81d108p-160},
+ {-0x1.a28976f62cc7p5, -0x1.fb29741037d8cp-47, -0x1.b8a719f2b3184p-100,
+ 0x1.272117e2ef7e5p-152},
+ {-0x1.76f62cc71fb28p5, -0x1.741037d8cdc54p-47, 0x1.cc1a99cfa4e44p-101,
+ -0x1.d03a21036be27p-153},
+ {0x1.d338e04d68bfp5, -0x1.bec66e29c67ccp-50, 0x1.339f49c845f8cp-102,
+ -0x1.081b5f13801dap-156},
+ {0x1.c09ad17df905p4, -0x1.9b8a719f2b318p-48, -0x1.6c6f740e8840cp-103,
+ -0x1.af89c00ed0004p-155},
+ {0x1.68befc827323cp5, -0x1.38cf9598c16c8p-47, 0x1.08bf177bf2508p-99,
+ -0x1.3801da00087eap-152},
+ {-0x1.037d8cdc538dp5, 0x1.a99cfa4e422fcp-49, 0x1.77bf250763ffp-103,
+ 0x1.2fffbc0b301fep-155},
+ {-0x1.8cdc538cf9598p5, -0x1.82d8dee81d108p-48, -0x1.b5f13801dap-104,
+ -0x1.0fd33f8086877p-157},
+ {-0x1.4e33e566305bp3, -0x1.bdd03a21036cp-49, 0x1.d8ffc4bffef04p-101,
+ -0x1.33f80868773a5p-153},
+ {-0x1.f2b3182d8dee8p4, -0x1.d1081b5f138p-52, -0x1.da00087e99fcp-104,
+ -0x1.0d0ee74a5f593p-158},
+ {-0x1.8c16c6f740e88p5, -0x1.036be27003b4p-49, -0x1.0fd33f8086878p-109,
+ 0x1.8b5a0a6d1f6d3p-162},
+ {0x1.3908bf177bf24p5, 0x1.0763ff12fffbcp-47, 0x1.6603fbcbc462cp-104,
+ 0x1.6829b47db4dap-156},
+ {0x1.7e2ef7e4a0ec8p4, -0x1.da00087e99fcp-56, -0x1.0d0ee74a5f594p-110,
+ 0x1.1f6d367ecf27dp-162},
+ {-0x1.081b5f13801dcp4, 0x1.fff7816603fbcp-48, 0x1.788c5ad05369p-101,
+ -0x1.25930261b069fp-155},
+ {-0x1.af89c00ed0004p5, -0x1.fa67f010d0ee8p-50, 0x1.6b414da3eda6cp-103,
+ 0x1.fb3c9f2c26dd4p-156},
+ {-0x1.c00ed00043f4cp5, -0x1.fc04343b9d298p-48, 0x1.4da3eda6cfdap-103,
+ -0x1.b069ec9161738p-155},
+ {0x1.2fffbc0b301fcp5, 0x1.e5e2316b414dcp-47, -0x1.c125930261b08p-99,
+ 0x1.6136e9e8c7ecdp-151},
+ {-0x1.0fd33f8086878p3, 0x1.8b5a0a6d1f6d4p-50, -0x1.30261b069ec9p-103,
+ -0x1.61738132c3403p-155},
+ {-0x1.9fc04343b9d28p4, -0x1.7d64b824b2604p-48, -0x1.86c1a7b24585cp-101,
+ -0x1.c09961a015d29p-154},
+ {-0x1.0d0ee74a5f594p2, 0x1.1f6d367ecf27cp-50, 0x1.6136e9e8c7eccp-103,
+ 0x1.3cbfd45aea4f7p-155},
+ {-0x1.dce94beb25c14p5, 0x1.a6cfd9e4f9614p-47, -0x1.22c2e70265868p-100,
+ -0x1.5d28ad8453814p-158},
+ {-0x1.4beb25c12593p5, -0x1.30d834f648b0cp-50, 0x1.8fd9a797fa8b4p-104,
+ 0x1.d49eeb1faf97cp-156},
+ {0x1.b47db4d9fb3c8p4, 0x1.f2c26dd3d18fcp-48, 0x1.9a797fa8b5d48p-100,
+ 0x1.eeb1faf97c5edp-152},
+ {-0x1.25930261b06ap5, 0x1.36e9e8c7ecd3cp-47, 0x1.7fa8b5d49eebp-100,
+ 0x1.faf97c5ecf41dp-152},
+ {0x1.fb3c9f2c26dd4p4, -0x1.738132c3402bcp-51, 0x1.aea4f758fd7ccp-103,
+ -0x1.d0985f18c10ebp-159},
+ {-0x1.b069ec9161738p5, -0x1.32c3402ba515cp-51, 0x1.eeb1faf97c5ecp-104,
+ 0x1.e839cfbc...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
|
plz reformat |
|
nickdesaulniers
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just minor stylistic comments.
| *sin_x = x; | ||
| *cos_x = 1.0; | ||
| return; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps in the future, we can have one implementation for computing sincos that BOTH sin and cos use, where once inlined we can skip computing the unneeded result. Currently, there's a fair amount of duplication between sincos, sin, and cos that I think can be avoided. Not necessary for this PR, but worth a thought.
I usually do |
| // the generated constants to precision <= 51, and splitting it by 2^28 + 1, | ||
| // then a * b = r.hi + r.lo is exact for all rounding modes. | ||
| template <bool NO_FMA_ALL_ROUNDINGS = false> | ||
| template <size_t SPLIT_B = 27> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
template <size_t SPLIT_B = DEFAULT_DOUBLE_SPLIT>| } | ||
|
|
||
| template <bool NO_FMA_ALL_ROUNDINGS = false> | ||
| template <size_t SPLIT_B = 27> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
template <size_t SPLIT_B = DEFAULT_DOUBLE_SPLIT>…lvm#111793) - Improve the accuracy of fast pass' range reduction. - Provide tighter error estimations. - Reduce the table size when `LIBC_MATH_SMALL_TABLES` flag is set.
LIBC_MATH_SMALL_TABLESflag is set.