Skip to content

Commit 513f9c5

Browse files
lntuepetrhosek
andauthored
[libc][math] Add float-only implementation for sinf / cosf. (#161680)
Based on the double precision's sin/cos fast path algorithm: Step 1: Perform range reduction `y = x mod pi/8` with target errors < 2^-54. This is because the worst case mod pi/8 for single precision is ~2^-31, so to have up to 1 ULP errors from the range reduction, the targeted errors should `be 2^(-31 - 23) = 2^-54`. Step 2: Polynomial approximation We use degree-5 and degree-4 polynomials to approximate sin and cos of the reduced angle respectively. Step 3: Combine the results using trig identities ```math \begin{align*} \sin(x) &= \sin(y) \cdot \cos(k \cdot \frac{\pi}{8}) + \cos(y) \cdot \sin(k \cdot \frac{\pi}{8}) \\ \cos(x) &= \cos(y) \cdot \cos(k \cdot \frac{\pi}{8}) - \sin(y) \cdot \sin(k \cdot \frac{\pi}{8}) \end{align*} ``` Overall errors: <= 3 ULPs for default rounding modes (tested exhaustively). Current limitation: large range reduction requires FMA instructions for binary32. This restriction will be removed in the followup PR. --------- Co-authored-by: Petr Hosek <[email protected]>
1 parent 4a1ea3e commit 513f9c5

File tree

13 files changed

+505
-16
lines changed

13 files changed

+505
-16
lines changed

libc/src/__support/FPUtil/double_double.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,9 @@ LIBC_INLINE NumberPair<T> exact_mult(T a, T b) {
144144
return r;
145145
}
146146

147-
LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) {
148-
DoubleDouble r = exact_mult(a, b.hi);
147+
template <typename T = double>
148+
LIBC_INLINE NumberPair<T> quick_mult(T a, const NumberPair<T> &b) {
149+
NumberPair<T> r = exact_mult(a, b.hi);
149150
r.lo = multiply_add(a, b.lo, r.lo);
150151
return r;
151152
}

libc/src/__support/math/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -926,6 +926,7 @@ add_header_library(
926926
sincosf_utils
927927
HDRS
928928
sincosf_utils.h
929+
sincosf_float_eval.h
929930
DEPENDS
930931
.range_reduction
931932
libc.src.__support.FPUtil.fp_bits

libc/src/__support/math/cosf.h

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
#ifndef LIBC_SRC___SUPPORT_MATH_COSF_H
1010
#define LIBC_SRC___SUPPORT_MATH_COSF_H
1111

12-
#include "sincosf_utils.h"
1312
#include "src/__support/FPUtil/FEnvImpl.h"
1413
#include "src/__support/FPUtil/FPBits.h"
1514
#include "src/__support/FPUtil/except_value_utils.h"
@@ -18,6 +17,26 @@
1817
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
1918
#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
2019

20+
#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) && \
21+
defined(LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT) && \
22+
defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
23+
24+
#include "sincosf_float_eval.h"
25+
26+
namespace LIBC_NAMESPACE_DECL {
27+
namespace math {
28+
29+
LIBC_INLINE static constexpr float cosf(float x) {
30+
return sincosf_float_eval::sincosf_eval</*IS_SIN*/ false>(x);
31+
}
32+
33+
} // namespace math
34+
} // namespace LIBC_NAMESPACE_DECL
35+
36+
#else // !LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT
37+
38+
#include "sincosf_utils.h"
39+
2140
namespace LIBC_NAMESPACE_DECL {
2241

2342
namespace math {
@@ -51,7 +70,6 @@ LIBC_INLINE static constexpr float cosf(float x) {
5170
xbits.set_sign(Sign::POS);
5271

5372
uint32_t x_abs = xbits.uintval();
54-
double xd = static_cast<double>(xbits.get_val());
5573

5674
// Range reduction:
5775
// For |x| > pi/16, we perform range reduction as follows:
@@ -90,6 +108,7 @@ LIBC_INLINE static constexpr float cosf(float x) {
90108
// computed using degree-7 and degree-6 minimax polynomials generated by
91109
// Sollya respectively.
92110

111+
#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
93112
// |x| < 0x1.0p-12f
94113
if (LIBC_UNLIKELY(x_abs < 0x3980'0000U)) {
95114
// When |x| < 2^-12, the relative error of the approximation cos(x) ~ 1
@@ -108,12 +127,12 @@ LIBC_INLINE static constexpr float cosf(float x) {
108127
// emulated version of FMA.
109128
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
110129
return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
111-
#else
130+
#else // !LIBC_TARGET_CPU_HAS_FMA_FLOAT
131+
double xd = static_cast<double>(xbits.get_val());
112132
return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
113133
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
114134
}
115135

116-
#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
117136
if (auto r = COSF_EXCEPTS.lookup(x_abs); LIBC_UNLIKELY(r.has_value()))
118137
return r.value();
119138
#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
@@ -132,6 +151,7 @@ LIBC_INLINE static constexpr float cosf(float x) {
132151
return x + FPBits::quiet_nan().get_val();
133152
}
134153

154+
double xd = static_cast<double>(xbits.get_val());
135155
// Combine the results with the sine of sum formula:
136156
// cos(x) = cos((k + y)*pi/32)
137157
// = cos(y*pi/32) * cos(k*pi/32) - sin(y*pi/32) * sin(k*pi/32)
@@ -150,3 +170,5 @@ LIBC_INLINE static constexpr float cosf(float x) {
150170
} // namespace LIBC_NAMESPACE_DECL
151171

152172
#endif // LIBC_SRC___SUPPORT_MATH_COSF_H
173+
174+
#endif // LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT
Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
//===-- Compute sin + cos for small angles ----------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_SINCOSF_FLOAT_EVAL_H
10+
#define LLVM_LIBC_SRC___SUPPORT_MATH_SINCOSF_FLOAT_EVAL_H
11+
12+
#include "src/__support/FPUtil/FEnvImpl.h"
13+
#include "src/__support/FPUtil/FPBits.h"
14+
#include "src/__support/FPUtil/double_double.h"
15+
#include "src/__support/FPUtil/multiply_add.h"
16+
#include "src/__support/FPUtil/nearest_integer.h"
17+
#include "src/__support/macros/config.h"
18+
19+
namespace LIBC_NAMESPACE_DECL {
20+
21+
namespace math {
22+
23+
namespace sincosf_float_eval {
24+
25+
// Since the worst case of `x mod pi` in single precision is > 2^-28, in order
26+
// to be bounded by 1 ULP, the range reduction accuracy will need to be at
27+
// least 2^(-28 - 23) = 2^-51.
28+
// For fast small range reduction, we will compute as follow:
29+
// Let pi ~ c0 + c1 + c2
30+
// with |c1| < ulp(c0)/2 and |c2| < ulp(c1)/2
31+
// then:
32+
// k := nearest_int(x * 1/pi);
33+
// u = (x - k * c0) - k * c1 - k * c2
34+
// We requires k * c0, k * c1 to be exactly representable in single precision.
35+
// Let p_k be the precision of k, then the precision of c0 and c1 are:
36+
// 24 - p_k,
37+
// and the ulp of (k * c2) is 2^(-3 * (24 - p_k)).
38+
// This give us the following bound on the precision of k:
39+
// 3 * (24 - p_k) >= 51,
40+
// or equivalently:
41+
// p_k <= 7.
42+
// We set the bound for p_k to be 6 so that we can have some more wiggle room
43+
// for computations.
44+
LIBC_INLINE static unsigned sincosf_range_reduction_small(float x, float &u) {
45+
// > display=hexadecimal;
46+
// > a = round(pi/8, 18, RN);
47+
// > b = round(pi/8 - a, 18, RN);
48+
// > c = round(pi/8 - a - b, SG, RN);
49+
// > round(8/pi, SG, RN);
50+
constexpr float MPI[3] = {-0x1.921f8p-2f, -0x1.aa22p-21f, -0x1.68c234p-41f};
51+
constexpr float ONE_OVER_PI = 0x1.45f306p+1f;
52+
float prod_hi = x * ONE_OVER_PI;
53+
float k = fputil::nearest_integer(prod_hi);
54+
55+
float y_hi = fputil::multiply_add(k, MPI[0], x); // Exact
56+
u = fputil::multiply_add(k, MPI[1], y_hi);
57+
u = fputil::multiply_add(k, MPI[2], u);
58+
return static_cast<unsigned>(static_cast<int>(k));
59+
}
60+
61+
// TODO: Add non-FMA version of large range reduction.
62+
LIBC_INLINE static unsigned sincosf_range_reduction_large(float x, float &u) {
63+
// > for i from 0 to 13 do {
64+
// if i < 2 then { pi_inv = 0.25 + 2^(8*(i - 2)) / pi; }
65+
// else { pi_inv = 2^(8*(i-2)) / pi; };
66+
// pn = nearestint(pi_inv);
67+
// pi_frac = pi_inv - pn;
68+
// a = round(pi_frac, SG, RN);
69+
// b = round(pi_frac - a, SG, RN);
70+
// c = round(pi_frac - a - b, SG, RN);
71+
// d = round(pi_frac - a - b - c, SG, RN);
72+
// print("{", 2^3 * a, ",", 2^3 * b, ",", 2^3 * c, ",", 2^3 * d, "},");
73+
// };
74+
constexpr float EIGHT_OVER_PI[14][4] = {
75+
{0x1.000146p1f, -0x1.9f246cp-28f, -0x1.bbead6p-54f, -0x1.ec5418p-85f},
76+
{0x1.0145f4p1f, -0x1.f246c6p-24f, -0x1.df56bp-49f, -0x1.ec5418p-77f},
77+
{0x1.45f306p1f, 0x1.b9391p-24f, 0x1.529fc2p-50f, 0x1.d5f47ep-76f},
78+
{0x1.f306dcp1f, 0x1.391054p-24f, 0x1.4fe13ap-49f, 0x1.7d1f54p-74f},
79+
{-0x1.f246c6p0f, -0x1.df56bp-25f, -0x1.ec5418p-53f, 0x1.f534dep-78f},
80+
{-0x1.236378p1f, 0x1.529fc2p-26f, 0x1.d5f47ep-52f, -0x1.65912p-77f},
81+
{0x1.391054p0f, 0x1.4fe13ap-25f, 0x1.7d1f54p-50f, -0x1.6447e4p-75f},
82+
{0x1.1054a8p0f, -0x1.ec5418p-29f, 0x1.f534dep-54f, -0x1.f924ecp-81f},
83+
{0x1.529fc2p-2f, 0x1.d5f47ep-28f, -0x1.65912p-53f, 0x1.b6c52cp-79f},
84+
{-0x1.ac07b2p1f, 0x1.5f47d4p-24f, 0x1.a6ee06p-49f, 0x1.b6295ap-74f},
85+
{-0x1.ec5418p-5f, 0x1.f534dep-30f, -0x1.f924ecp-57f, 0x1.5993c4p-82f},
86+
{0x1.3abe9p-1f, -0x1.596448p-27f, 0x1.b6c52cp-55f, -0x1.9b0ef2p-80f},
87+
{-0x1.505c16p1f, 0x1.a6ee06p-25f, 0x1.b6295ap-50f, -0x1.b0ef1cp-76f},
88+
{-0x1.70565ap-1f, 0x1.dc0db6p-26f, 0x1.4acc9ep-53f, 0x1.0e4108p-80f},
89+
};
90+
91+
using FPBits = typename fputil::FPBits<float>;
92+
using fputil::FloatFloat;
93+
FPBits xbits(x);
94+
95+
int x_e_m32 = xbits.get_biased_exponent() - (FPBits::EXP_BIAS + 32);
96+
unsigned idx = static_cast<unsigned>((x_e_m32 >> 3) + 2);
97+
// Scale x down by 2^(-(8 * (idx - 2))
98+
xbits.set_biased_exponent((x_e_m32 & 7) + FPBits::EXP_BIAS + 32);
99+
// 2^32 <= |x_reduced| < 2^(32 + 8) = 2^40
100+
float x_reduced = xbits.get_val();
101+
// x * c_hi = ph.hi + ph.lo exactly.
102+
FloatFloat ph = fputil::exact_mult<float>(x_reduced, EIGHT_OVER_PI[idx][0]);
103+
// x * c_mid = pm.hi + pm.lo exactly.
104+
FloatFloat pm = fputil::exact_mult<float>(x_reduced, EIGHT_OVER_PI[idx][1]);
105+
// x * c_lo = pl.hi + pl.lo exactly.
106+
FloatFloat pl = fputil::exact_mult<float>(x_reduced, EIGHT_OVER_PI[idx][2]);
107+
// Extract integral parts and fractional parts of (ph.lo + pm.hi).
108+
float sum_hi = ph.lo + pm.hi;
109+
float k = fputil::nearest_integer(sum_hi);
110+
111+
// x * 8/pi mod 1 ~ y_hi + y_mid + y_lo
112+
float y_hi = (ph.lo - k) + pm.hi; // Exact
113+
FloatFloat y_mid = fputil::exact_add(pm.lo, pl.hi);
114+
float y_lo = pl.lo;
115+
116+
// y_l = x * c_lo_2 + pl.lo
117+
float y_l = fputil::multiply_add(x_reduced, EIGHT_OVER_PI[idx][3], y_lo);
118+
FloatFloat y = fputil::exact_add(y_hi, y_mid.hi);
119+
y.lo += (y_mid.lo + y_l);
120+
121+
// Digits of pi/8, generated by Sollya with:
122+
// > a = round(pi/8, SG, RN);
123+
// > b = round(pi/8 - SG, D, RN);
124+
constexpr FloatFloat PI_OVER_8 = {-0x1.777a5cp-27f, 0x1.921fb6p-2f};
125+
126+
// Error bound: with {a} denote the fractional part of a, i.e.:
127+
// {a} = a - round(a)
128+
// Then,
129+
// | {x * 8/pi} - (y_hi + y_lo) | <= ulp(ulp(y_hi)) <= 2^-47
130+
// | {x mod pi/8} - (u.hi + u.lo) | < 2 * 2^-5 * 2^-47 = 2^-51
131+
u = fputil::multiply_add(y.hi, PI_OVER_8.hi, y.lo * PI_OVER_8.hi);
132+
133+
return static_cast<unsigned>(static_cast<int>(k));
134+
}
135+
136+
template <bool IS_SIN> LIBC_INLINE static float sincosf_eval(float x) {
137+
// sin(k * pi/8) for k = 0..15, generated by Sollya with:
138+
// > for k from 0 to 16 do {
139+
// print(round(sin(k * pi/8), SG, RN));
140+
// };
141+
constexpr float SIN_K_PI_OVER_8[16] = {
142+
0.0f, 0x1.87de2ap-2f, 0x1.6a09e6p-1f, 0x1.d906bcp-1f,
143+
1.0f, 0x1.d906bcp-1f, 0x1.6a09e6p-1f, 0x1.87de2ap-2f,
144+
0.0f, -0x1.87de2ap-2f, -0x1.6a09e6p-1f, -0x1.d906bcp-1f,
145+
-1.0f, -0x1.d906bcp-1f, -0x1.6a09e6p-1f, -0x1.87de2ap-2f,
146+
};
147+
148+
using FPBits = fputil::FPBits<float>;
149+
FPBits xbits(x);
150+
uint32_t x_abs = cpp::bit_cast<uint32_t>(x) & 0x7fff'ffffU;
151+
152+
float y;
153+
unsigned k = 0;
154+
if (x_abs < 0x4880'0000U) {
155+
k = sincosf_range_reduction_small(x, y);
156+
} else {
157+
158+
if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) {
159+
if (xbits.is_signaling_nan()) {
160+
fputil::raise_except_if_required(FE_INVALID);
161+
return FPBits::quiet_nan().get_val();
162+
}
163+
164+
if (x_abs == 0x7f80'0000U) {
165+
fputil::set_errno_if_required(EDOM);
166+
fputil::raise_except_if_required(FE_INVALID);
167+
}
168+
return x + FPBits::quiet_nan().get_val();
169+
}
170+
171+
k = sincosf_range_reduction_large(x, y);
172+
}
173+
174+
float sin_k = SIN_K_PI_OVER_8[k & 15];
175+
// cos(k * pi/8) = sin(k * pi/8 + pi/2) = sin((k + 4) * pi/8).
176+
// cos_k = cos(k * pi/8)
177+
float cos_k = SIN_K_PI_OVER_8[(k + 4) & 15];
178+
179+
float y_sq = y * y;
180+
181+
// Polynomial approximation of sin(y) and cos(y) for |y| <= pi/16:
182+
//
183+
// Using Taylor polynomial for sin(y):
184+
// sin(y) ~ y - y^3 / 6 + y^5 / 120
185+
// Using minimax polynomial generated by Sollya for cos(y) with:
186+
// > Q = fpminimax(cos(x), [|0, 2, 4|], [|1, SG...|], [0, pi/16]);
187+
//
188+
// Error bounds:
189+
// * For sin(y)
190+
// > P = x - SG(1/6)*x^3 + SG(1/120) * x^5;
191+
// > dirtyinfnorm((sin(x) - P)/sin(x), [-pi/16, pi/16]);
192+
// 0x1.825...p-27
193+
// * For cos(y)
194+
// > Q = fpminimax(cos(x), [|0, 2, 4|], [|1, SG...|], [0, pi/16]);
195+
// > dirtyinfnorm((sin(x) - P)/sin(x), [-pi/16, pi/16]);
196+
// 0x1.aa8...p-29
197+
198+
// p1 = y^2 * 1/120 - 1/6
199+
float p1 = fputil::multiply_add(y_sq, 0x1.111112p-7f, -0x1.555556p-3f);
200+
// q1 = y^2 * coeff(Q, 4) + coeff(Q, 2)
201+
float q1 = fputil::multiply_add(y_sq, 0x1.54b8bep-5f, -0x1.ffffc4p-2f);
202+
float y3 = y_sq * y;
203+
// c1 ~ cos(y)
204+
float c1 = fputil::multiply_add(y_sq, q1, 1.0f);
205+
// s1 ~ sin(y)
206+
float s1 = fputil::multiply_add(y3, p1, y);
207+
208+
if constexpr (IS_SIN) {
209+
// sin(x) = cos(k * pi/8) * sin(y) + sin(k * pi/8) * cos(y).
210+
return fputil::multiply_add(cos_k, s1, sin_k * c1);
211+
} else {
212+
// cos(x) = cos(k * pi/8) * cos(y) - sin(k * pi/8) * sin(y).
213+
return fputil::multiply_add(cos_k, c1, -sin_k * s1);
214+
}
215+
}
216+
217+
} // namespace sincosf_float_eval
218+
219+
} // namespace math
220+
221+
} // namespace LIBC_NAMESPACE_DECL
222+
223+
#endif // LLVM_LIBC_SRC___SUPPORT_MATH_SINCOSF_FLOAT_EVAL_H

libc/src/math/generic/sinf.cpp

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,30 @@
1717
#include "src/__support/macros/config.h"
1818
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
1919
#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
20+
21+
#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) && \
22+
defined(LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT) && \
23+
defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
24+
25+
#include "src/__support/math/sincosf_float_eval.h"
26+
27+
namespace LIBC_NAMESPACE_DECL {
28+
29+
LLVM_LIBC_FUNCTION(float, sinf, (float x)) {
30+
return math::sincosf_float_eval::sincosf_eval</*IS_SIN*/ true>(x);
31+
}
32+
33+
} // namespace LIBC_NAMESPACE_DECL
34+
35+
#else // !LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT
36+
2037
#include "src/__support/math/sincosf_utils.h"
2138

22-
#if defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE)
39+
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
2340
#include "src/__support/math/range_reduction_fma.h"
24-
#else
41+
#else // !LIBC_TARGET_CPU_HAS_FMA_DOUBLE
2542
#include "src/__support/math/range_reduction.h"
26-
#endif
43+
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
2744

2845
namespace LIBC_NAMESPACE_DECL {
2946

@@ -162,3 +179,4 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) {
162179
}
163180

164181
} // namespace LIBC_NAMESPACE_DECL
182+
#endif // LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT

0 commit comments

Comments
 (0)