Skip to content

Commit b9d87d7

Browse files
committed
[libc] Improve the performance of exp2f.
Reduce the range-reduction table size from 128 entries down to 64 entries, and reduce the polynomial's degree from 6 down to 4. Currently we use a degree-6 minimax polynomial on an interval of length 2^-7 around 0 to compute exp2f. Based on the suggestion of @santoshn and the RLIBM project (https://github.com/rutgers-apl/rlibm-prog/blob/main/libm/float/exp2.c) it is possible to have a good polynomial of degree-4 on a subinterval of length 2^(-6) to approximate 2^x. We did try to either reduce the degree of the polynomial down to 3 or increase the interval size to 2^(-5), but in both cases the number of exceptional values exploded. So we settle with using a degree-4 polynomial of the interval of size 2^(-6) around 0. Reviewed By: michaelrj, sivachandra, zimmermann6, santoshn Differential Revision: https://reviews.llvm.org/D122346
1 parent e5a7d27 commit b9d87d7

File tree

2 files changed

+129
-150
lines changed

2 files changed

+129
-150
lines changed

libc/src/math/generic/exp2f.cpp

Lines changed: 106 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -18,148 +18,149 @@
1818

1919
namespace __llvm_libc {
2020

21-
// Lookup table for 2^(m * 2^(-7)) with m = 0, ..., 127.
21+
// Lookup table for 2^(m * 2^(-6)) with m = 0, ..., 63.
2222
// Table is generated with Sollya as follow:
2323
// > display = hexadecimal;
24-
// > for i from 0 to 127 do { D(2^(i / 128)); };
25-
static constexpr double EXP_M[128] = {
26-
0x1.0000000000000p0, 0x1.0163da9fb3335p0, 0x1.02c9a3e778061p0,
27-
0x1.04315e86e7f85p0, 0x1.059b0d3158574p0, 0x1.0706b29ddf6dep0,
28-
0x1.0874518759bc8p0, 0x1.09e3ecac6f383p0, 0x1.0b5586cf9890fp0,
29-
0x1.0cc922b7247f7p0, 0x1.0e3ec32d3d1a2p0, 0x1.0fb66affed31bp0,
30-
0x1.11301d0125b51p0, 0x1.12abdc06c31ccp0, 0x1.1429aaea92de0p0,
31-
0x1.15a98c8a58e51p0, 0x1.172b83c7d517bp0, 0x1.18af9388c8deap0,
32-
0x1.1a35beb6fcb75p0, 0x1.1bbe084045cd4p0, 0x1.1d4873168b9aap0,
33-
0x1.1ed5022fcd91dp0, 0x1.2063b88628cd6p0, 0x1.21f49917ddc96p0,
34-
0x1.2387a6e756238p0, 0x1.251ce4fb2a63fp0, 0x1.26b4565e27cddp0,
35-
0x1.284dfe1f56381p0, 0x1.29e9df51fdee1p0, 0x1.2b87fd0dad990p0,
36-
0x1.2d285a6e4030bp0, 0x1.2ecafa93e2f56p0, 0x1.306fe0a31b715p0,
37-
0x1.32170fc4cd831p0, 0x1.33c08b26416ffp0, 0x1.356c55f929ff1p0,
38-
0x1.371a7373aa9cbp0, 0x1.38cae6d05d866p0, 0x1.3a7db34e59ff7p0,
39-
0x1.3c32dc313a8e5p0, 0x1.3dea64c123422p0, 0x1.3fa4504ac801cp0,
40-
0x1.4160a21f72e2ap0, 0x1.431f5d950a897p0, 0x1.44e086061892dp0,
41-
0x1.46a41ed1d0057p0, 0x1.486a2b5c13cd0p0, 0x1.4a32af0d7d3dep0,
42-
0x1.4bfdad5362a27p0, 0x1.4dcb299fddd0dp0, 0x1.4f9b2769d2ca7p0,
43-
0x1.516daa2cf6642p0, 0x1.5342b569d4f82p0, 0x1.551a4ca5d920fp0,
44-
0x1.56f4736b527dap0, 0x1.58d12d497c7fdp0, 0x1.5ab07dd485429p0,
45-
0x1.5c9268a5946b7p0, 0x1.5e76f15ad2148p0, 0x1.605e1b976dc09p0,
46-
0x1.6247eb03a5585p0, 0x1.6434634ccc320p0, 0x1.6623882552225p0,
47-
0x1.68155d44ca973p0, 0x1.6a09e667f3bcdp0, 0x1.6c012750bdabfp0,
48-
0x1.6dfb23c651a2fp0, 0x1.6ff7df9519484p0, 0x1.71f75e8ec5f74p0,
49-
0x1.73f9a48a58174p0, 0x1.75feb564267c9p0, 0x1.780694fde5d3fp0,
50-
0x1.7a11473eb0187p0, 0x1.7c1ed0130c132p0, 0x1.7e2f336cf4e62p0,
51-
0x1.80427543e1a12p0, 0x1.82589994cce13p0, 0x1.8471a4623c7adp0,
52-
0x1.868d99b4492edp0, 0x1.88ac7d98a6699p0, 0x1.8ace5422aa0dbp0,
53-
0x1.8cf3216b5448cp0, 0x1.8f1ae99157736p0, 0x1.9145b0b91ffc6p0,
54-
0x1.93737b0cdc5e5p0, 0x1.95a44cbc8520fp0, 0x1.97d829fde4e50p0,
55-
0x1.9a0f170ca07bap0, 0x1.9c49182a3f090p0, 0x1.9e86319e32323p0,
56-
0x1.a0c667b5de565p0, 0x1.a309bec4a2d33p0, 0x1.a5503b23e255dp0,
57-
0x1.a799e1330b358p0, 0x1.a9e6b5579fdbfp0, 0x1.ac36bbfd3f37ap0,
58-
0x1.ae89f995ad3adp0, 0x1.b0e07298db666p0, 0x1.b33a2b84f15fbp0,
59-
0x1.b59728de5593ap0, 0x1.b7f76f2fb5e47p0, 0x1.ba5b030a1064ap0,
60-
0x1.bcc1e904bc1d2p0, 0x1.bf2c25bd71e09p0, 0x1.c199bdd85529cp0,
61-
0x1.c40ab5fffd07ap0, 0x1.c67f12e57d14bp0, 0x1.c8f6d9406e7b5p0,
62-
0x1.cb720dcef9069p0, 0x1.cdf0b555dc3fap0, 0x1.d072d4a07897cp0,
63-
0x1.d2f87080d89f2p0, 0x1.d5818dcfba487p0, 0x1.d80e316c98398p0,
64-
0x1.da9e603db3285p0, 0x1.dd321f301b460p0, 0x1.dfc97337b9b5fp0,
65-
0x1.e264614f5a129p0, 0x1.e502ee78b3ff6p0, 0x1.e7a51fbc74c83p0,
66-
0x1.ea4afa2a490dap0, 0x1.ecf482d8e67f1p0, 0x1.efa1bee615a27p0,
67-
0x1.f252b376bba97p0, 0x1.f50765b6e4540p0, 0x1.f7bfdad9cbe14p0,
68-
0x1.fa7c1819e90d8p0, 0x1.fd3c22b8f71f1p0,
24+
// > for i from 0 to 63 do { D(2^(i / 64)); };
25+
static constexpr double EXP_M[64] = {
26+
0x1.0000000000000p0, 0x1.02c9a3e778061p0, 0x1.059b0d3158574p0,
27+
0x1.0874518759bc8p0, 0x1.0b5586cf9890fp0, 0x1.0e3ec32d3d1a2p0,
28+
0x1.11301d0125b51p0, 0x1.1429aaea92de0p0, 0x1.172b83c7d517bp0,
29+
0x1.1a35beb6fcb75p0, 0x1.1d4873168b9aap0, 0x1.2063b88628cd6p0,
30+
0x1.2387a6e756238p0, 0x1.26b4565e27cddp0, 0x1.29e9df51fdee1p0,
31+
0x1.2d285a6e4030bp0, 0x1.306fe0a31b715p0, 0x1.33c08b26416ffp0,
32+
0x1.371a7373aa9cbp0, 0x1.3a7db34e59ff7p0, 0x1.3dea64c123422p0,
33+
0x1.4160a21f72e2ap0, 0x1.44e086061892dp0, 0x1.486a2b5c13cd0p0,
34+
0x1.4bfdad5362a27p0, 0x1.4f9b2769d2ca7p0, 0x1.5342b569d4f82p0,
35+
0x1.56f4736b527dap0, 0x1.5ab07dd485429p0, 0x1.5e76f15ad2148p0,
36+
0x1.6247eb03a5585p0, 0x1.6623882552225p0, 0x1.6a09e667f3bcdp0,
37+
0x1.6dfb23c651a2fp0, 0x1.71f75e8ec5f74p0, 0x1.75feb564267c9p0,
38+
0x1.7a11473eb0187p0, 0x1.7e2f336cf4e62p0, 0x1.82589994cce13p0,
39+
0x1.868d99b4492edp0, 0x1.8ace5422aa0dbp0, 0x1.8f1ae99157736p0,
40+
0x1.93737b0cdc5e5p0, 0x1.97d829fde4e50p0, 0x1.9c49182a3f090p0,
41+
0x1.a0c667b5de565p0, 0x1.a5503b23e255dp0, 0x1.a9e6b5579fdbfp0,
42+
0x1.ae89f995ad3adp0, 0x1.b33a2b84f15fbp0, 0x1.b7f76f2fb5e47p0,
43+
0x1.bcc1e904bc1d2p0, 0x1.c199bdd85529cp0, 0x1.c67f12e57d14bp0,
44+
0x1.cb720dcef9069p0, 0x1.d072d4a07897cp0, 0x1.d5818dcfba487p0,
45+
0x1.da9e603db3285p0, 0x1.dfc97337b9b5fp0, 0x1.e502ee78b3ff6p0,
46+
0x1.ea4afa2a490dap0, 0x1.efa1bee615a27p0, 0x1.f50765b6e4540p0,
47+
0x1.fa7c1819e90d8p0,
6948
};
7049

7150
INLINE_FMA
7251
LLVM_LIBC_FUNCTION(float, exp2f, (float x)) {
7352
using FPBits = typename fputil::FPBits<float>;
7453
FPBits xbits(x);
7554

76-
// When x =< -150 or nan
77-
if (unlikely(xbits.uintval() >= 0xc316'0000U)) {
78-
// exp(-Inf) = 0
79-
if (xbits.is_inf())
80-
return 0.0f;
81-
// exp(nan) = nan
82-
if (xbits.is_nan())
83-
return x;
84-
if (fputil::get_round() == FE_UPWARD)
85-
return static_cast<float>(FPBits(FPBits::MIN_SUBNORMAL));
86-
if (x != 0.0f)
87-
errno = ERANGE;
88-
return 0.0f;
89-
}
90-
// x >= 128 or nan
91-
if (unlikely(!xbits.get_sign() && (xbits.uintval() >= 0x4300'0000U))) {
92-
if (xbits.uintval() < 0x7f80'0000U) {
93-
int rounding = fputil::get_round();
94-
if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
95-
return static_cast<float>(FPBits(FPBits::MAX_NORMAL));
55+
uint32_t x_u = xbits.uintval();
56+
uint32_t x_abs = x_u & 0x7fff'ffffU;
9657

97-
errno = ERANGE;
98-
}
99-
return x + static_cast<float>(FPBits::inf());
100-
}
101-
// |x| < 2^-25
102-
if (unlikely(xbits.get_unbiased_exponent() <= 101)) {
103-
return 1.0f + x;
104-
}
10558
// Exceptional values.
106-
switch (xbits.uintval()) {
59+
switch (x_u) {
10760
case 0x3b42'9d37U: // x = 0x1.853a6ep-9f
10861
if (fputil::get_round() == FE_TONEAREST)
10962
return 0x1.00870ap+0f;
11063
break;
64+
case 0x3c02'a9adU: // x = 0x1.05535ap-7f
65+
if (fputil::get_round() == FE_TONEAREST)
66+
return 0x1.016b46p+0f;
67+
break;
68+
case 0x3ca6'6e26U: { // x = 0x1.4cdc4cp-6f
69+
int round_mode = fputil::get_round();
70+
if (round_mode == FE_TONEAREST || round_mode == FE_UPWARD)
71+
return 0x1.03a16ap+0f;
72+
return 0x1.03a168p+0f;
73+
}
74+
case 0x3d92'a282U: // x = 0x1.254504p-4f
75+
if (fputil::get_round() == FE_UPWARD)
76+
return 0x1.0d0688p+0f;
77+
return 0x1.0d0686p+0f;
11178
case 0xbcf3'a937U: // x = -0x1.e7526ep-6f
11279
if (fputil::get_round() == FE_TONEAREST)
11380
return 0x1.f58d62p-1f;
11481
break;
82+
case 0xb8d3'd026U: // x = -0x1.a7a04cp-14f
83+
if (fputil::get_round() == FE_TONEAREST)
84+
return 0x1.fff6d2p-1f;
85+
break;
11586
}
11687

88+
// // When |x| >= 128, |x| < 2^-25, or x is nan
89+
if (unlikely(x_abs >= 0x4300'0000U || x_abs <= 0x3280'0000U)) {
90+
// |x| < 2^-25
91+
if (x_abs <= 0x3280'0000U) {
92+
return 1.0f + x;
93+
}
94+
// x >= 128
95+
if (!xbits.get_sign()) {
96+
// x is finite
97+
if (x_u < 0x7f80'0000U) {
98+
int rounding = fputil::get_round();
99+
if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
100+
return static_cast<float>(FPBits(FPBits::MAX_NORMAL));
101+
102+
errno = ERANGE;
103+
}
104+
// x is +inf or nan
105+
return x + static_cast<float>(FPBits::inf());
106+
}
107+
// x < -150
108+
if (x_u >= 0xc316'0000U) {
109+
// exp(-Inf) = 0
110+
if (xbits.is_inf())
111+
return 0.0f;
112+
// exp(nan) = nan
113+
if (xbits.is_nan())
114+
return x;
115+
if (fputil::get_round() == FE_UPWARD)
116+
return static_cast<float>(FPBits(FPBits::MIN_SUBNORMAL));
117+
if (x != 0.0f)
118+
errno = ERANGE;
119+
return 0.0f;
120+
}
121+
}
117122
// For -150 <= x < 128, to compute 2^x, we perform the following range
118123
// reduction: find hi, mid, lo such that:
119124
// x = hi + mid + lo, in which
120125
// hi is an integer,
121-
// mid * 2^7 is an integer
122-
// -2^(-8) <= lo < 2^-8.
126+
// mid * 2^6 is an integer
127+
// -2^(-7) <= lo < 2^-7.
123128
// In particular,
124-
// hi + mid = round(x * 2^7) * 2^(-7).
129+
// hi + mid = round(x * 2^6) * 2^(-6).
125130
// Then,
126131
// 2^(x) = 2^(hi + mid + lo) = 2^hi * 2^mid * 2^lo.
127132
// Multiply by 2^hi is simply adding hi to the exponent field. We store
128-
// exp(mid) in the lookup tables EXP_M. exp(lo) is computed using a degree-7
133+
// exp(mid) in the lookup tables EXP_M. exp(lo) is computed using a degree-4
129134
// minimax polynomial generated by Sollya.
130135

131-
// x_hi = hi + mid.
132-
int x_hi = static_cast<int>(x * 0x1.0p7f);
133-
// Subtract (hi + mid) from x to get lo.
134-
x -= static_cast<float>(x_hi) * 0x1.0p-7f;
135-
double xd = static_cast<double>(x);
136-
// Make sure that -2^(-8) <= lo < 2^-8.
137-
if (x >= 0x1.0p-8f) {
138-
++x_hi;
139-
xd -= 0x1.0p-7;
140-
}
141-
if (x < -0x1.0p-8f) {
142-
--x_hi;
143-
xd += 0x1.0p-7;
144-
}
136+
// x_hi = round(hi + mid).
137+
// The default rounding mode for float-to-int conversion in C++ is
138+
// round-toward-zero. To make it round-to-nearest, we add (-1)^sign(x) * 0.5
139+
// before conversion.
140+
int x_hi =
141+
static_cast<int>(x * 0x1.0p+6f + (xbits.get_sign() ? -0.5f : 0.5f));
145142
// For 2-complement integers, arithmetic right shift is the same as dividing
146143
// by a power of 2 and then round down (toward negative infinity).
147-
int hi = x_hi >> 7;
148-
// mid = x_hi & 0x0000'007fU;
149-
double exp_mid = EXP_M[x_hi & 0x7f];
150-
// Degree-6 minimax polynomial generated by Sollya with the following
144+
int e_hi = (x_hi >> 6) +
145+
static_cast<int>(fputil::FloatProperties<double>::EXPONENT_BIAS);
146+
fputil::FPBits<double> exp_hi(
147+
static_cast<uint64_t>(e_hi)
148+
<< fputil::FloatProperties<double>::MANTISSA_WIDTH);
149+
// mid = x_hi & 0x0000'003fU;
150+
double exp_hi_mid = static_cast<double>(exp_hi) * EXP_M[x_hi & 0x3f];
151+
// Subtract (hi + mid) from x to get lo.
152+
x -= static_cast<float>(x_hi) * 0x1.0p-6f;
153+
double xd = static_cast<double>(x);
154+
// Degree-4 minimax polynomial generated by Sollya with the following
151155
// commands:
152156
// > display = hexadecimal;
153-
// > Q = fpminimax((2^x - 1)/x, 5, [|D...|], [-2^-8, 2^-8]);
157+
// > Q = fpminimax((2^x - 1)/x, 3, [|D...|], [-2^-7, 2^-7]);
154158
// > Q;
155159
double exp_lo =
156-
fputil::polyeval(xd, 0x1p0, 0x1.62e42fefa39efp-1, 0x1.ebfbdff82c58ep-3,
157-
0x1.c6b08d711fe2fp-5, 0x1.3b2ab6fe3deb5p-7,
158-
0x1.5d72a05f45c04p-10, 0x1.4284d40c33326p-13);
159-
fputil::FPBits<double> result(exp_mid * exp_lo);
160-
result.set_unbiased_exponent(static_cast<uint16_t>(
161-
static_cast<int>(result.get_unbiased_exponent()) + hi));
162-
return static_cast<float>(static_cast<double>(result));
160+
fputil::polyeval(xd, 0x1p0, 0x1.62e42fefa2417p-1, 0x1.ebfbdff82f809p-3,
161+
0x1.c6b0b92131c47p-5, 0x1.3b2ab6fb568a3p-7);
162+
double result = exp_hi_mid * exp_lo;
163+
return static_cast<float>(result);
163164
}
164165

165166
} // namespace __llvm_libc

libc/test/src/math/exp2f_test.cpp

Lines changed: 23 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -51,51 +51,29 @@ TEST(LlvmLibcExp2fTest, Overflow) {
5151
EXPECT_MATH_ERRNO(ERANGE);
5252
}
5353

54-
// Test with inputs which are the borders of underflow/overflow but still
55-
// produce valid results without setting errno.
56-
TEST(LlvmLibcExp2fTest, Borderline) {
57-
float x;
58-
59-
errno = 0;
60-
x = float(FPBits(0x42fa0001U));
61-
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
62-
__llvm_libc::exp2f(x), 0.5);
63-
EXPECT_MATH_ERRNO(0);
64-
65-
x = float(FPBits(0x42ffffffU));
66-
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
67-
__llvm_libc::exp2f(x), 0.5);
68-
EXPECT_MATH_ERRNO(0);
69-
70-
x = float(FPBits(0xc2fa0001U));
71-
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
72-
__llvm_libc::exp2f(x), 0.5);
73-
EXPECT_MATH_ERRNO(0);
74-
75-
x = float(FPBits(0xc2fc0000U));
76-
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
77-
__llvm_libc::exp2f(x), 0.5);
78-
EXPECT_MATH_ERRNO(0);
79-
80-
x = float(FPBits(0xc2fc0001U));
81-
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
82-
__llvm_libc::exp2f(x), 0.5);
83-
EXPECT_MATH_ERRNO(0);
84-
85-
x = float(FPBits(0xc3150000U));
86-
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
87-
__llvm_libc::exp2f(x), 0.5);
88-
EXPECT_MATH_ERRNO(0);
89-
90-
x = float(FPBits(0x3b42'9d37U));
91-
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
92-
__llvm_libc::exp2f(x), 0.5);
93-
EXPECT_MATH_ERRNO(0);
94-
95-
x = float(FPBits(0xbcf3'a937U));
96-
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
97-
__llvm_libc::exp2f(x), 0.5);
98-
EXPECT_MATH_ERRNO(0);
54+
TEST(LlvmLibcExp2fTest, TrickyInputs) {
55+
constexpr int N = 12;
56+
constexpr uint32_t INPUTS[N] = {
57+
0x3b429d37U, /*0x1.853a6ep-9f*/
58+
0x3c02a9adU, /*0x1.05535ap-7f*/
59+
0x3ca66e26U, /*0x1.4cdc4cp-6f*/
60+
0x3d92a282U, /*0x1.254504p-4f*/
61+
0x42fa0001U, /*0x1.f40002p+6f*/
62+
0x42ffffffU, /*0x1.fffffep+6f*/
63+
0xb8d3d026U, /*-0x1.a7a04cp-14f*/
64+
0xbcf3a937U, /*-0x1.e7526ep-6f*/
65+
0xc2fa0001U, /*-0x1.f40002p+6f*/
66+
0xc2fc0000U, /*-0x1.f8p+6f*/
67+
0xc2fc0001U, /*-0x1.f80002p+6f*/
68+
0xc3150000U, /*-0x1.2ap+7f*/
69+
};
70+
for (int i = 0; i < N; ++i) {
71+
errno = 0;
72+
float x = float(FPBits(INPUTS[i]));
73+
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
74+
__llvm_libc::exp2f(x), 0.5);
75+
EXPECT_MATH_ERRNO(0);
76+
}
9977
}
10078

10179
TEST(LlvmLibcExp2fTest, Underflow) {

0 commit comments

Comments
 (0)