@@ -20,41 +20,52 @@ namespace fputil {
2020
2121#define DEFAULT_DOUBLE_SPLIT 27
2222
23- using DoubleDouble = LIBC_NAMESPACE::NumberPair<double >;
23+ template <typename T> struct DefaultSplit ;
24+ template <> struct DefaultSplit <float > {
25+ static constexpr size_t VALUE = 12 ;
26+ };
27+ template <> struct DefaultSplit <double > {
28+ static constexpr size_t VALUE = DEFAULT_DOUBLE_SPLIT;
29+ };
30+
31+ using DoubleDouble = NumberPair<double >;
32+ using FloatFloat = NumberPair<float >;
2433
2534// The output of Dekker's FastTwoSum algorithm is correct, i.e.:
2635// r.hi + r.lo = a + b exactly
2736// and |r.lo| < eps(r.lo)
2837// Assumption: |a| >= |b|, or a = 0.
29- template <bool FAST2SUM = true >
30- LIBC_INLINE constexpr DoubleDouble exact_add (double a, double b) {
31- DoubleDouble r{0.0 , 0.0 };
38+ template <bool FAST2SUM = true , typename T = double >
39+ LIBC_INLINE constexpr NumberPair<T> exact_add (T a, T b) {
40+ NumberPair<T> r{0.0 , 0.0 };
3241 if constexpr (FAST2SUM) {
3342 r.hi = a + b;
34- double t = r.hi - a;
43+ T t = r.hi - a;
3544 r.lo = b - t;
3645 } else {
3746 r.hi = a + b;
38- double t1 = r.hi - a;
39- double t2 = r.hi - t1;
40- double t3 = b - t1;
41- double t4 = a - t2;
47+ T t1 = r.hi - a;
48+ T t2 = r.hi - t1;
49+ T t3 = b - t1;
50+ T t4 = a - t2;
4251 r.lo = t3 + t4;
4352 }
4453 return r;
4554}
4655
4756// Assumption: |a.hi| >= |b.hi|
48- LIBC_INLINE constexpr DoubleDouble add (const DoubleDouble &a,
49- const DoubleDouble &b) {
50- DoubleDouble r = exact_add (a.hi , b.hi );
51- double lo = a.lo + b.lo ;
57+ template <typename T>
58+ LIBC_INLINE constexpr NumberPair<T> add (const NumberPair<T> &a,
59+ const NumberPair<T> &b) {
60+ NumberPair<T> r = exact_add (a.hi , b.hi );
61+ T lo = a.lo + b.lo ;
5262 return exact_add (r.hi , r.lo + lo);
5363}
5464
5565// Assumption: |a.hi| >= |b|
56- LIBC_INLINE constexpr DoubleDouble add (const DoubleDouble &a, double b) {
57- DoubleDouble r = exact_add<false >(a.hi , b);
66+ template <typename T>
67+ LIBC_INLINE constexpr NumberPair<T> add (const NumberPair<T> &a, T b) {
68+ NumberPair<T> r = exact_add<false >(a.hi , b);
5869 return exact_add (r.hi , r.lo + a.lo );
5970}
6071
@@ -63,12 +74,12 @@ LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) {
6374// Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
6475// Roundings," https://inria.hal.science/hal-04480440.
6576// Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1.
66- template <size_t N = DEFAULT_DOUBLE_SPLIT >
67- LIBC_INLINE constexpr DoubleDouble split (double a) {
68- DoubleDouble r{0.0 , 0.0 };
77+ template <typename T = double , size_t N = DefaultSplit<T>::VALUE >
78+ LIBC_INLINE constexpr NumberPair<T> split (T a) {
79+ NumberPair<T> r{0.0 , 0.0 };
6980 // CN = 2^N.
70- constexpr double CN = static_cast <double >(1 << N);
71- constexpr double C = CN + 1.0 ;
81+ constexpr T CN = static_cast <T >(1 << N);
82+ constexpr T C = CN + 1.0 ;
7283 double t1 = C * a;
7384 double t2 = a - t1;
7485 r.hi = t1 + t2;
@@ -77,16 +88,15 @@ LIBC_INLINE constexpr DoubleDouble split(double a) {
7788}
7889
7990// Helper for non-fma exact mult where the first number is already split.
80- template <size_t SPLIT_B = DEFAULT_DOUBLE_SPLIT>
81- LIBC_INLINE DoubleDouble exact_mult (const DoubleDouble &as, double a,
82- double b) {
83- DoubleDouble bs = split<SPLIT_B>(b);
84- DoubleDouble r{0.0 , 0.0 };
91+ template <typename T = double , size_t SPLIT_B = DefaultSplit<T>::VALUE>
92+ LIBC_INLINE NumberPair<T> exact_mult (const NumberPair<T> &as, T a, T b) {
93+ NumberPair<T> bs = split<T, SPLIT_B>(b);
94+ NumberPair<T> r{0.0 , 0.0 };
8595
8696 r.hi = a * b;
87- double t1 = as.hi * bs.hi - r.hi ;
88- double t2 = as.hi * bs.lo + t1;
89- double t3 = as.lo * bs.hi + t2;
97+ T t1 = as.hi * bs.hi - r.hi ;
98+ T t2 = as.hi * bs.lo + t1;
99+ T t3 = as.lo * bs.hi + t2;
90100 r.lo = as.lo * bs.lo + t3;
91101
92102 return r;
@@ -99,18 +109,18 @@ LIBC_INLINE DoubleDouble exact_mult(const DoubleDouble &as, double a,
99109// Using Theorem 1 in the paper above, without FMA instruction, if we restrict
100110// the generated constants to precision <= 51, and splitting it by 2^28 + 1,
101111// then a * b = r.hi + r.lo is exact for all rounding modes.
102- template <size_t SPLIT_B = 27 >
103- LIBC_INLINE DoubleDouble exact_mult (double a, double b) {
104- DoubleDouble r{0.0 , 0.0 };
112+ template <typename T = double , size_t SPLIT_B = DefaultSplit<T>::VALUE >
113+ LIBC_INLINE NumberPair<T> exact_mult (T a, T b) {
114+ NumberPair<T> r{0.0 , 0.0 };
105115
106116#ifdef LIBC_TARGET_CPU_HAS_FMA
107117 r.hi = a * b;
108118 r.lo = fputil::multiply_add (a, b, -r.hi );
109119#else
110120 // Dekker's Product.
111- DoubleDouble as = split (a);
121+ NumberPair<T> as = split (a);
112122
113- r = exact_mult<SPLIT_B>(as, a, b);
123+ r = exact_mult<T, SPLIT_B>(as, a, b);
114124#endif // LIBC_TARGET_CPU_HAS_FMA
115125
116126 return r;
@@ -125,7 +135,7 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) {
125135template <size_t SPLIT_B = 27 >
126136LIBC_INLINE DoubleDouble quick_mult (const DoubleDouble &a,
127137 const DoubleDouble &b) {
128- DoubleDouble r = exact_mult<SPLIT_B>(a.hi , b.hi );
138+ DoubleDouble r = exact_mult<double , SPLIT_B>(a.hi , b.hi );
129139 double t1 = multiply_add (a.hi , b.lo , r.lo );
130140 double t2 = multiply_add (a.lo , b.hi , t1);
131141 r.lo = t2;
@@ -157,19 +167,20 @@ LIBC_INLINE DoubleDouble multiply_add<DoubleDouble>(const DoubleDouble &a,
157167// rl = q * (ah - bh * rh) + q * (al - bl * rh)
158168// as accurate as possible, then the error is bounded by:
159169// |(ah + al) / (bh + bl) - (rh + rl)| < O(bl/bh) * (2^-52 + al/ah + bl/bh)
160- LIBC_INLINE DoubleDouble div (const DoubleDouble &a, const DoubleDouble &b) {
161- DoubleDouble r;
162- double q = 1.0 / b.hi ;
170+ template <typename T>
171+ LIBC_INLINE NumberPair<T> div (const NumberPair<T> &a, const NumberPair<T> &b) {
172+ NumberPair<T> r;
173+ T q = T (1 ) / b.hi ;
163174 r.hi = a.hi * q;
164175
165176#ifdef LIBC_TARGET_CPU_HAS_FMA
166- double e_hi = fputil::multiply_add (b.hi , -r.hi , a.hi );
167- double e_lo = fputil::multiply_add (b.lo , -r.hi , a.lo );
177+ T e_hi = fputil::multiply_add (b.hi , -r.hi , a.hi );
178+ T e_lo = fputil::multiply_add (b.lo , -r.hi , a.lo );
168179#else
169- DoubleDouble b_hi_r_hi = fputil::exact_mult (b.hi , -r.hi );
170- DoubleDouble b_lo_r_hi = fputil::exact_mult (b.lo , -r.hi );
171- double e_hi = (a.hi + b_hi_r_hi.hi ) + b_hi_r_hi.lo ;
172- double e_lo = (a.lo + b_lo_r_hi.hi ) + b_lo_r_hi.lo ;
180+ NumberPair<T> b_hi_r_hi = fputil::exact_mult (b.hi , -r.hi );
181+ NumberPair<T> b_lo_r_hi = fputil::exact_mult (b.lo , -r.hi );
182+ T e_hi = (a.hi + b_hi_r_hi.hi ) + b_hi_r_hi.lo ;
183+ T e_lo = (a.lo + b_lo_r_hi.hi ) + b_lo_r_hi.lo ;
173184#endif // LIBC_TARGET_CPU_HAS_FMA
174185
175186 r.lo = q * (e_hi + e_lo);
0 commit comments