1717#include " src/__support/macros/config.h"
1818#include " src/__support/macros/optimization.h" // LIBC_UNLIKELY
1919#include " src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
20+ #include " src/math/generic/range_reduction_double_common.h"
2021#include " src/math/generic/sincos_eval.h"
2122
22- // TODO: We might be able to improve the performance of large range reduction of
23- // non-FMA targets further by operating directly on 25-bit chunks of 128/pi and
24- // pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of
25- // those lookup table.
26- #include " range_reduction_double_common.h"
27-
28- #if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0)
29- #define LIBC_MATH_COS_SKIP_ACCURATE_PASS
30- #endif
23+ #ifdef LIBC_TARGET_CPU_HAS_FMA
24+ #include " range_reduction_double_fma.h"
25+ #else
26+ #include " range_reduction_double_nofma.h"
27+ #endif // LIBC_TARGET_CPU_HAS_FMA
3128
3229namespace LIBC_NAMESPACE_DECL {
3330
@@ -42,22 +39,29 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
4239
4340 DoubleDouble y;
4441 unsigned k;
45- generic:: LargeRangeReduction<NO_FMA> range_reduction_large{};
42+ LargeRangeReduction range_reduction_large{};
4643
47- // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA)
44+ // |x| < 2^16.
4845 if (LIBC_LIKELY (x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) {
49- // |x| < 2^-27
50- if (LIBC_UNLIKELY (x_e < FPBits::EXP_BIAS - 27 )) {
51- // Signed zeros.
52- if (LIBC_UNLIKELY (x == 0.0 ))
53- return 1.0 ;
54-
55- // For |x| < 2^-27, |cos(x) - 1| < |x|^2/2 < 2^-54 = ulp(1 - 2^-53)/2.
56- return fputil::round_result_slightly_down (1.0 );
46+ // |x| < 2^-7
47+ if (LIBC_UNLIKELY (x_e < FPBits::EXP_BIAS - 7 )) {
48+ // |x| < 2^-27
49+ if (LIBC_UNLIKELY (x_e < FPBits::EXP_BIAS - 27 )) {
50+ // Signed zeros.
51+ if (LIBC_UNLIKELY (x == 0.0 ))
52+ return 1.0 ;
53+
54+ // For |x| < 2^-27, |cos(x) - 1| < |x|^2/2 < 2^-54 = ulp(1 - 2^-53)/2.
55+ return fputil::round_result_slightly_down (1.0 );
56+ }
57+ // No range reduction needed.
58+ k = 0 ;
59+ y.lo = 0.0 ;
60+ y.hi = x;
61+ } else {
62+ // Small range reduction.
63+ k = range_reduction_small (x, y);
5764 }
58-
59- // // Small range reduction.
60- k = range_reduction_small (x, y);
6165 } else {
6266 // Inf or NaN
6367 if (LIBC_UNLIKELY (x_e > 2 * FPBits::EXP_BIAS)) {
@@ -70,70 +74,51 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
7074 }
7175
7276 // Large range reduction.
73- k = range_reduction_large.compute_high_part (x);
74- y = range_reduction_large.fast ();
77+ k = range_reduction_large.fast (x, y);
7578 }
7679
7780 DoubleDouble sin_y, cos_y;
7881
79- generic::sincos_eval (y, sin_y, cos_y);
82+ [[maybe_unused]] double err = generic::sincos_eval (y, sin_y, cos_y);
8083
8184 // Look up sin(k * pi/128) and cos(k * pi/128)
82- // Memory saving versions:
83-
84- // Use 128-entry table instead:
85- // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127];
86- // uint64_t sin_s = static_cast<uint64_t>((k + 128) & 128) << (63 - 7);
87- // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val();
88- // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val();
89- // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127];
90- // uint64_t cos_s = static_cast<uint64_t>((k + 64) & 128) << (63 - 7);
91- // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val();
92- // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val();
93-
94- // Use 64-entry table instead:
95- // auto get_idx_dd = [](unsigned kk) -> DoubleDouble {
96- // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
97- // DoubleDouble ans = SIN_K_PI_OVER_128[idx];
98- // if (kk & 128) {
99- // ans.hi = -ans.hi;
100- // ans.lo = -ans.lo;
101- // }
102- // return ans;
103- // };
104- // DoubleDouble sin_k = get_idx_dd(k + 128);
105- // DoubleDouble cos_k = get_idx_dd(k + 64);
106-
85+ #ifdef LIBC_MATH_HAS_SMALL_TABLES
86+ // Memory saving versions. Use 65-entry table.
87+ auto get_idx_dd = [](unsigned kk) -> DoubleDouble {
88+ unsigned idx = (kk & 64 ) ? 64 - (kk & 63 ) : (kk & 63 );
89+ DoubleDouble ans = SIN_K_PI_OVER_128[idx];
90+ if (kk & 128 ) {
91+ ans.hi = -ans.hi ;
92+ ans.lo = -ans.lo ;
93+ }
94+ return ans;
95+ };
96+ DoubleDouble sin_k = get_idx_dd (k + 128 );
97+ DoubleDouble cos_k = get_idx_dd (k + 64 );
98+ #else
10799 // Fast look up version, but needs 256-entry table.
108100 // -sin(k * pi/128) = sin((k + 128) * pi/128)
109101 // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128).
110102 DoubleDouble msin_k = SIN_K_PI_OVER_128[(k + 128 ) & 255 ];
111103 DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64 ) & 255 ];
104+ #endif // LIBC_MATH_HAS_SMALL_TABLES
112105
113106 // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128).
114107 // So k is an integer and -pi / 256 <= y <= pi / 256.
115108 // Then cos(x) = cos((k * pi/128 + y)
116109 // = cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128)
117- DoubleDouble cos_k_cos_y = fputil::quick_mult<NO_FMA> (cos_y, cos_k);
118- DoubleDouble msin_k_sin_y = fputil::quick_mult<NO_FMA> (sin_y, msin_k);
110+ DoubleDouble cos_k_cos_y = fputil::quick_mult (cos_y, cos_k);
111+ DoubleDouble msin_k_sin_y = fputil::quick_mult (sin_y, msin_k);
119112
120113 DoubleDouble rr = fputil::exact_add<false >(cos_k_cos_y.hi , msin_k_sin_y.hi );
121114 rr.lo += msin_k_sin_y.lo + cos_k_cos_y.lo ;
122115
123- #ifdef LIBC_MATH_COS_SKIP_ACCURATE_PASS
116+ #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
124117 return rr.hi + rr.lo ;
125118#else
126119
127- // Accurate test and pass for correctly rounded implementation.
128- #ifdef LIBC_TARGET_CPU_HAS_FMA
129- constexpr double ERR = 0x1 .0p-70 ;
130- #else
131- // TODO: Improve non-FMA fast pass accuracy.
132- constexpr double ERR = 0x1 .0p-66 ;
133- #endif // LIBC_TARGET_CPU_HAS_FMA
134-
135- double rlp = rr.lo + ERR;
136- double rlm = rr.lo - ERR;
120+ double rlp = rr.lo + err;
121+ double rlm = rr.lo - err;
137122
138123 double r_upper = rr.hi + rlp; // (rr.lo + ERR);
139124 double r_lower = rr.hi + rlm; // (rr.lo - ERR);
@@ -144,15 +129,15 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
144129
145130 Float128 u_f128, sin_u, cos_u;
146131 if (LIBC_LIKELY (x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT))
147- u_f128 = generic:: range_reduction_small_f128 (x);
132+ u_f128 = range_reduction_small_f128 (x);
148133 else
149134 u_f128 = range_reduction_large.accurate ();
150135
151136 generic::sincos_eval (u_f128, sin_u, cos_u);
152137
153138 auto get_sin_k = [](unsigned kk) -> Float128 {
154139 unsigned idx = (kk & 64 ) ? 64 - (kk & 63 ) : (kk & 63 );
155- Float128 ans = generic:: SIN_K_PI_OVER_128_F128[idx];
140+ Float128 ans = SIN_K_PI_OVER_128_F128[idx];
156141 if (kk & 128 )
157142 ans.sign = Sign::NEG;
158143 return ans;
@@ -172,7 +157,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) {
172157 // https://github.com/llvm/llvm-project/issues/96452.
173158
174159 return static_cast <double >(r);
175- #endif // !LIBC_MATH_COS_SKIP_ACCURATE_PASS
160+ #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
176161}
177162
178163} // namespace LIBC_NAMESPACE_DECL
0 commit comments