|
| 1 | +//===-- Compute sin + cos for small angles ----------------------*- C++ -*-===// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | + |
| 9 | +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_SINCOSF_FLOAT_EVAL_H |
| 10 | +#define LLVM_LIBC_SRC___SUPPORT_MATH_SINCOSF_FLOAT_EVAL_H |
| 11 | + |
| 12 | +#include "src/__support/FPUtil/FEnvImpl.h" |
| 13 | +#include "src/__support/FPUtil/FPBits.h" |
| 14 | +#include "src/__support/FPUtil/double_double.h" |
| 15 | +#include "src/__support/FPUtil/multiply_add.h" |
| 16 | +#include "src/__support/FPUtil/nearest_integer.h" |
| 17 | +#include "src/__support/macros/config.h" |
| 18 | + |
| 19 | +namespace LIBC_NAMESPACE_DECL { |
| 20 | + |
| 21 | +namespace math { |
| 22 | + |
| 23 | +namespace sincosf_float_eval { |
| 24 | + |
| 25 | +// Since the worst case of `x mod pi` in single precision is > 2^-28, in order |
| 26 | +// to be bounded by 1 ULP, the range reduction accuracy will need to be at |
| 27 | +// least 2^(-28 - 23) = 2^-51. |
| 28 | +// For fast small range reduction, we will compute as follow: |
| 29 | +// Let pi ~ c0 + c1 + c2 |
| 30 | +// with |c1| < ulp(c0)/2 and |c2| < ulp(c1)/2 |
| 31 | +// then: |
| 32 | +// k := nearest_int(x * 1/pi); |
| 33 | +// u = (x - k * c0) - k * c1 - k * c2 |
| 34 | +// We requires k * c0, k * c1 to be exactly representable in single precision. |
| 35 | +// Let p_k be the precision of k, then the precision of c0 and c1 are: |
| 36 | +// 24 - p_k, |
| 37 | +// and the ulp of (k * c2) is 2^(-3 * (24 - p_k)). |
| 38 | +// This give us the following bound on the precision of k: |
| 39 | +// 3 * (24 - p_k) >= 51, |
| 40 | +// or equivalently: |
| 41 | +// p_k <= 7. |
| 42 | +// We set the bound for p_k to be 6 so that we can have some more wiggle room |
| 43 | +// for computations. |
| 44 | +LIBC_INLINE static unsigned sincosf_range_reduction_small(float x, float &u) { |
| 45 | + // > display=hexadecimal; |
| 46 | + // > a = round(pi/8, 18, RN); |
| 47 | + // > b = round(pi/8 - a, 18, RN); |
| 48 | + // > c = round(pi/8 - a - b, SG, RN); |
| 49 | + // > round(8/pi, SG, RN); |
| 50 | + constexpr float MPI[3] = {-0x1.921f8p-2f, -0x1.aa22p-21f, -0x1.68c234p-41f}; |
| 51 | + constexpr float ONE_OVER_PI = 0x1.45f306p+1f; |
| 52 | + float prod_hi = x * ONE_OVER_PI; |
| 53 | + float k = fputil::nearest_integer(prod_hi); |
| 54 | + |
| 55 | + float y_hi = fputil::multiply_add(k, MPI[0], x); // Exact |
| 56 | + u = fputil::multiply_add(k, MPI[1], y_hi); |
| 57 | + u = fputil::multiply_add(k, MPI[2], u); |
| 58 | + return static_cast<unsigned>(static_cast<int>(k)); |
| 59 | +} |
| 60 | + |
| 61 | +// TODO: Add non-FMA version of large range reduction. |
| 62 | +LIBC_INLINE static unsigned sincosf_range_reduction_large(float x, float &u) { |
| 63 | + // > for i from 0 to 13 do { |
| 64 | + // if i < 2 then { pi_inv = 0.25 + 2^(8*(i - 2)) / pi; } |
| 65 | + // else { pi_inv = 2^(8*(i-2)) / pi; }; |
| 66 | + // pn = nearestint(pi_inv); |
| 67 | + // pi_frac = pi_inv - pn; |
| 68 | + // a = round(pi_frac, SG, RN); |
| 69 | + // b = round(pi_frac - a, SG, RN); |
| 70 | + // c = round(pi_frac - a - b, SG, RN); |
| 71 | + // d = round(pi_frac - a - b - c, SG, RN); |
| 72 | + // print("{", 2^3 * a, ",", 2^3 * b, ",", 2^3 * c, ",", 2^3 * d, "},"); |
| 73 | + // }; |
| 74 | + constexpr float EIGHT_OVER_PI[14][4] = { |
| 75 | + {0x1.000146p1f, -0x1.9f246cp-28f, -0x1.bbead6p-54f, -0x1.ec5418p-85f}, |
| 76 | + {0x1.0145f4p1f, -0x1.f246c6p-24f, -0x1.df56bp-49f, -0x1.ec5418p-77f}, |
| 77 | + {0x1.45f306p1f, 0x1.b9391p-24f, 0x1.529fc2p-50f, 0x1.d5f47ep-76f}, |
| 78 | + {0x1.f306dcp1f, 0x1.391054p-24f, 0x1.4fe13ap-49f, 0x1.7d1f54p-74f}, |
| 79 | + {-0x1.f246c6p0f, -0x1.df56bp-25f, -0x1.ec5418p-53f, 0x1.f534dep-78f}, |
| 80 | + {-0x1.236378p1f, 0x1.529fc2p-26f, 0x1.d5f47ep-52f, -0x1.65912p-77f}, |
| 81 | + {0x1.391054p0f, 0x1.4fe13ap-25f, 0x1.7d1f54p-50f, -0x1.6447e4p-75f}, |
| 82 | + {0x1.1054a8p0f, -0x1.ec5418p-29f, 0x1.f534dep-54f, -0x1.f924ecp-81f}, |
| 83 | + {0x1.529fc2p-2f, 0x1.d5f47ep-28f, -0x1.65912p-53f, 0x1.b6c52cp-79f}, |
| 84 | + {-0x1.ac07b2p1f, 0x1.5f47d4p-24f, 0x1.a6ee06p-49f, 0x1.b6295ap-74f}, |
| 85 | + {-0x1.ec5418p-5f, 0x1.f534dep-30f, -0x1.f924ecp-57f, 0x1.5993c4p-82f}, |
| 86 | + {0x1.3abe9p-1f, -0x1.596448p-27f, 0x1.b6c52cp-55f, -0x1.9b0ef2p-80f}, |
| 87 | + {-0x1.505c16p1f, 0x1.a6ee06p-25f, 0x1.b6295ap-50f, -0x1.b0ef1cp-76f}, |
| 88 | + {-0x1.70565ap-1f, 0x1.dc0db6p-26f, 0x1.4acc9ep-53f, 0x1.0e4108p-80f}, |
| 89 | + }; |
| 90 | + |
| 91 | + using FPBits = typename fputil::FPBits<float>; |
| 92 | + using fputil::FloatFloat; |
| 93 | + FPBits xbits(x); |
| 94 | + |
| 95 | + int x_e_m32 = xbits.get_biased_exponent() - (FPBits::EXP_BIAS + 32); |
| 96 | + unsigned idx = static_cast<unsigned>((x_e_m32 >> 3) + 2); |
| 97 | + // Scale x down by 2^(-(8 * (idx - 2)) |
| 98 | + xbits.set_biased_exponent((x_e_m32 & 7) + FPBits::EXP_BIAS + 32); |
| 99 | + // 2^32 <= |x_reduced| < 2^(32 + 8) = 2^40 |
| 100 | + float x_reduced = xbits.get_val(); |
| 101 | + // x * c_hi = ph.hi + ph.lo exactly. |
| 102 | + FloatFloat ph = fputil::exact_mult<float>(x_reduced, EIGHT_OVER_PI[idx][0]); |
| 103 | + // x * c_mid = pm.hi + pm.lo exactly. |
| 104 | + FloatFloat pm = fputil::exact_mult<float>(x_reduced, EIGHT_OVER_PI[idx][1]); |
| 105 | + // x * c_lo = pl.hi + pl.lo exactly. |
| 106 | + FloatFloat pl = fputil::exact_mult<float>(x_reduced, EIGHT_OVER_PI[idx][2]); |
| 107 | + // Extract integral parts and fractional parts of (ph.lo + pm.hi). |
| 108 | + float sum_hi = ph.lo + pm.hi; |
| 109 | + float k = fputil::nearest_integer(sum_hi); |
| 110 | + |
| 111 | + // x * 8/pi mod 1 ~ y_hi + y_mid + y_lo |
| 112 | + float y_hi = (ph.lo - k) + pm.hi; // Exact |
| 113 | + FloatFloat y_mid = fputil::exact_add(pm.lo, pl.hi); |
| 114 | + float y_lo = pl.lo; |
| 115 | + |
| 116 | + // y_l = x * c_lo_2 + pl.lo |
| 117 | + float y_l = fputil::multiply_add(x_reduced, EIGHT_OVER_PI[idx][3], y_lo); |
| 118 | + FloatFloat y = fputil::exact_add(y_hi, y_mid.hi); |
| 119 | + y.lo += (y_mid.lo + y_l); |
| 120 | + |
| 121 | + // Digits of pi/8, generated by Sollya with: |
| 122 | + // > a = round(pi/8, SG, RN); |
| 123 | + // > b = round(pi/8 - SG, D, RN); |
| 124 | + constexpr FloatFloat PI_OVER_8 = {-0x1.777a5cp-27f, 0x1.921fb6p-2f}; |
| 125 | + |
| 126 | + // Error bound: with {a} denote the fractional part of a, i.e.: |
| 127 | + // {a} = a - round(a) |
| 128 | + // Then, |
| 129 | + // | {x * 8/pi} - (y_hi + y_lo) | <= ulp(ulp(y_hi)) <= 2^-47 |
| 130 | + // | {x mod pi/8} - (u.hi + u.lo) | < 2 * 2^-5 * 2^-47 = 2^-51 |
| 131 | + u = fputil::multiply_add(y.hi, PI_OVER_8.hi, y.lo * PI_OVER_8.hi); |
| 132 | + |
| 133 | + return static_cast<unsigned>(static_cast<int>(k)); |
| 134 | +} |
| 135 | + |
| 136 | +template <bool IS_SIN> LIBC_INLINE static float sincosf_eval(float x) { |
| 137 | + // sin(k * pi/8) for k = 0..15, generated by Sollya with: |
| 138 | + // > for k from 0 to 16 do { |
| 139 | + // print(round(sin(k * pi/8), SG, RN)); |
| 140 | + // }; |
| 141 | + constexpr float SIN_K_PI_OVER_8[16] = { |
| 142 | + 0.0f, 0x1.87de2ap-2f, 0x1.6a09e6p-1f, 0x1.d906bcp-1f, |
| 143 | + 1.0f, 0x1.d906bcp-1f, 0x1.6a09e6p-1f, 0x1.87de2ap-2f, |
| 144 | + 0.0f, -0x1.87de2ap-2f, -0x1.6a09e6p-1f, -0x1.d906bcp-1f, |
| 145 | + -1.0f, -0x1.d906bcp-1f, -0x1.6a09e6p-1f, -0x1.87de2ap-2f, |
| 146 | + }; |
| 147 | + |
| 148 | + using FPBits = fputil::FPBits<float>; |
| 149 | + FPBits xbits(x); |
| 150 | + uint32_t x_abs = cpp::bit_cast<uint32_t>(x) & 0x7fff'ffffU; |
| 151 | + |
| 152 | + float y; |
| 153 | + unsigned k = 0; |
| 154 | + if (x_abs < 0x4880'0000U) { |
| 155 | + k = sincosf_range_reduction_small(x, y); |
| 156 | + } else { |
| 157 | + |
| 158 | + if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) { |
| 159 | + if (xbits.is_signaling_nan()) { |
| 160 | + fputil::raise_except_if_required(FE_INVALID); |
| 161 | + return FPBits::quiet_nan().get_val(); |
| 162 | + } |
| 163 | + |
| 164 | + if (x_abs == 0x7f80'0000U) { |
| 165 | + fputil::set_errno_if_required(EDOM); |
| 166 | + fputil::raise_except_if_required(FE_INVALID); |
| 167 | + } |
| 168 | + return x + FPBits::quiet_nan().get_val(); |
| 169 | + } |
| 170 | + |
| 171 | + k = sincosf_range_reduction_large(x, y); |
| 172 | + } |
| 173 | + |
| 174 | + float sin_k = SIN_K_PI_OVER_8[k & 15]; |
| 175 | + // cos(k * pi/8) = sin(k * pi/8 + pi/2) = sin((k + 4) * pi/8). |
| 176 | + // cos_k = cos(k * pi/8) |
| 177 | + float cos_k = SIN_K_PI_OVER_8[(k + 4) & 15]; |
| 178 | + |
| 179 | + float y_sq = y * y; |
| 180 | + |
| 181 | + // Polynomial approximation of sin(y) and cos(y) for |y| <= pi/16: |
| 182 | + // |
| 183 | + // Using Taylor polynomial for sin(y): |
| 184 | + // sin(y) ~ y - y^3 / 6 + y^5 / 120 |
| 185 | + // Using minimax polynomial generated by Sollya for cos(y) with: |
| 186 | + // > Q = fpminimax(cos(x), [|0, 2, 4|], [|1, SG...|], [0, pi/16]); |
| 187 | + // |
| 188 | + // Error bounds: |
| 189 | + // * For sin(y) |
| 190 | + // > P = x - SG(1/6)*x^3 + SG(1/120) * x^5; |
| 191 | + // > dirtyinfnorm((sin(x) - P)/sin(x), [-pi/16, pi/16]); |
| 192 | + // 0x1.825...p-27 |
| 193 | + // * For cos(y) |
| 194 | + // > Q = fpminimax(cos(x), [|0, 2, 4|], [|1, SG...|], [0, pi/16]); |
| 195 | + // > dirtyinfnorm((sin(x) - P)/sin(x), [-pi/16, pi/16]); |
| 196 | + // 0x1.aa8...p-29 |
| 197 | + |
| 198 | + // p1 = y^2 * 1/120 - 1/6 |
| 199 | + float p1 = fputil::multiply_add(y_sq, 0x1.111112p-7f, -0x1.555556p-3f); |
| 200 | + // q1 = y^2 * coeff(Q, 4) + coeff(Q, 2) |
| 201 | + float q1 = fputil::multiply_add(y_sq, 0x1.54b8bep-5f, -0x1.ffffc4p-2f); |
| 202 | + float y3 = y_sq * y; |
| 203 | + // c1 ~ cos(y) |
| 204 | + float c1 = fputil::multiply_add(y_sq, q1, 1.0f); |
| 205 | + // s1 ~ sin(y) |
| 206 | + float s1 = fputil::multiply_add(y3, p1, y); |
| 207 | + |
| 208 | + if constexpr (IS_SIN) { |
| 209 | + // sin(x) = cos(k * pi/8) * sin(y) + sin(k * pi/8) * cos(y). |
| 210 | + return fputil::multiply_add(cos_k, s1, sin_k * c1); |
| 211 | + } else { |
| 212 | + // cos(x) = cos(k * pi/8) * cos(y) - sin(k * pi/8) * sin(y). |
| 213 | + return fputil::multiply_add(cos_k, c1, -sin_k * s1); |
| 214 | + } |
| 215 | +} |
| 216 | + |
| 217 | +} // namespace sincosf_float_eval |
| 218 | + |
| 219 | +} // namespace math |
| 220 | + |
| 221 | +} // namespace LIBC_NAMESPACE_DECL |
| 222 | + |
| 223 | +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_SINCOSF_FLOAT_EVAL_H |
0 commit comments