Skip to content

Commit 00e5afd

Browse files
blapiensz-arm
authored andcommitted
math: add scalar erf
Only tested in round-to-nearest mode. The expected worst case error is 1.01 ULP near x=1.25. Benchmarked over random x in [-6,6] and can increase performance by > 2x (> 3.5x for throughput) on big ooo cores compared to the implementation in glibc 2.28. Includes data for erfc too, but this patch only adds erf.
1 parent fc3fcc9 commit 00e5afd

File tree

7 files changed

+375
-0
lines changed

7 files changed

+375
-0
lines changed

math/erf.c

Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
/*
2+
* Double-precision erf(x) function.
3+
*
4+
* Copyright (c) 2020, Arm Limited.
5+
* SPDX-License-Identifier: MIT
6+
*/
7+
8+
#include "math_config.h"
9+
#include <math.h>
10+
#include <stdint.h>
11+
12+
#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
13+
#define C 0x1.b0ac16p-1
14+
#define PA __erf_data.erf_poly_A
15+
#define NA __erf_data.erf_ratio_N_A
16+
#define DA __erf_data.erf_ratio_D_A
17+
#define NB __erf_data.erf_ratio_N_B
18+
#define DB __erf_data.erf_ratio_D_B
19+
#define PC __erf_data.erfc_poly_C
20+
#define PD __erf_data.erfc_poly_D
21+
#define PE __erf_data.erfc_poly_E
22+
#define PF __erf_data.erfc_poly_F
23+
24+
/* Top 32 bits of a double. */
25+
static inline uint32_t
26+
top32 (double x)
27+
{
28+
return asuint64 (x) >> 32;
29+
}
30+
31+
/* Fast erf implementation using a mix of
32+
rational and polynomial approximations.
33+
Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. */
34+
double
35+
erf (double x)
36+
{
37+
/* Get top word and sign. */
38+
uint32_t ix = top32 (x);
39+
uint32_t ia = ix & 0x7fffffff;
40+
uint32_t sign = ix >> 31;
41+
42+
/* Normalized and subnormal cases */
43+
if (ia < 0x3feb0000)
44+
{ /* a = |x| < 0.84375. */
45+
46+
if (ia < 0x3e300000)
47+
{ /* a < 2^(-28). */
48+
if (ia < 0x00800000)
49+
{ /* a < 2^(-1015). */
50+
double y = x + TwoOverSqrtPiMinusOne * x;
51+
return check_uflow (y);
52+
}
53+
return x + TwoOverSqrtPiMinusOne * x;
54+
}
55+
56+
double x2 = x * x;
57+
58+
if (ia < 0x3fe00000)
59+
{ /* a < 0.5 - Use polynomial approximation. */
60+
double r1 = fma (x2, PA[1], PA[0]);
61+
double r2 = fma (x2, PA[3], PA[2]);
62+
double r3 = fma (x2, PA[5], PA[4]);
63+
double r4 = fma (x2, PA[7], PA[6]);
64+
double r5 = fma (x2, PA[9], PA[8]);
65+
double x4 = x2 * x2;
66+
double r = r5;
67+
r = fma (x4, r, r4);
68+
r = fma (x4, r, r3);
69+
r = fma (x4, r, r2);
70+
r = fma (x4, r, r1);
71+
return fma (r, x, x); /* This fma is crucial for accuracy. */
72+
}
73+
else
74+
{ /* 0.5 <= a < 0.84375 - Use rational approximation. */
75+
double x4, x8, r1n, r2n, r1d, r2d, r3d;
76+
77+
r1n = fma (x2, NA[1], NA[0]);
78+
x4 = x2 * x2;
79+
r2n = fma (x2, NA[3], NA[2]);
80+
x8 = x4 * x4;
81+
r1d = fma (x2, DA[0], 1.0);
82+
r2d = fma (x2, DA[2], DA[1]);
83+
r3d = fma (x2, DA[4], DA[3]);
84+
double P = r1n + x4 * r2n + x8 * NA[4];
85+
double Q = r1d + x4 * r2d + x8 * r3d;
86+
return fma (P / Q, x, x);
87+
}
88+
}
89+
else if (ia < 0x3ff40000)
90+
{ /* 0.84375 <= |x| < 1.25. */
91+
double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d;
92+
double a = fabs (x) - 1.0;
93+
r1n = fma (a, NB[1], NB[0]);
94+
a2 = a * a;
95+
r1d = fma (a, DB[0], 1.0);
96+
a4 = a2 * a2;
97+
r2n = fma (a, NB[3], NB[2]);
98+
a6 = a4 * a2;
99+
r2d = fma (a, DB[2], DB[1]);
100+
r3n = fma (a, NB[5], NB[4]);
101+
r3d = fma (a, DB[4], DB[3]);
102+
r4n = NB[6];
103+
r4d = DB[5];
104+
double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n;
105+
double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d;
106+
if (sign)
107+
return -C - P / Q;
108+
else
109+
return C + P / Q;
110+
}
111+
else if (ia < 0x40000000)
112+
{ /* 1.25 <= |x| < 2.0. */
113+
double a = fabs (x);
114+
a = a - 1.25;
115+
116+
double r1 = fma (a, PC[1], PC[0]);
117+
double r2 = fma (a, PC[3], PC[2]);
118+
double r3 = fma (a, PC[5], PC[4]);
119+
double r4 = fma (a, PC[7], PC[6]);
120+
double r5 = fma (a, PC[9], PC[8]);
121+
double r6 = fma (a, PC[11], PC[10]);
122+
double r7 = fma (a, PC[13], PC[12]);
123+
double r8 = fma (a, PC[15], PC[14]);
124+
125+
double a2 = a * a;
126+
127+
double r = r8;
128+
r = fma (a2, r, r7);
129+
r = fma (a2, r, r6);
130+
r = fma (a2, r, r5);
131+
r = fma (a2, r, r4);
132+
r = fma (a2, r, r3);
133+
r = fma (a2, r, r2);
134+
r = fma (a2, r, r1);
135+
136+
if (sign)
137+
return -1.0 + r;
138+
else
139+
return 1.0 - r;
140+
}
141+
else if (ia < 0x400a0000)
142+
{ /* 2 <= |x| < 3.25. */
143+
double a = fabs (x);
144+
a = fma (0.5, a, -1.0);
145+
146+
double r1 = fma (a, PD[1], PD[0]);
147+
double r2 = fma (a, PD[3], PD[2]);
148+
double r3 = fma (a, PD[5], PD[4]);
149+
double r4 = fma (a, PD[7], PD[6]);
150+
double r5 = fma (a, PD[9], PD[8]);
151+
double r6 = fma (a, PD[11], PD[10]);
152+
double r7 = fma (a, PD[13], PD[12]);
153+
double r8 = fma (a, PD[15], PD[14]);
154+
double r9 = fma (a, PD[17], PD[16]);
155+
156+
double a2 = a * a;
157+
158+
double r = r9;
159+
r = fma (a2, r, r8);
160+
r = fma (a2, r, r7);
161+
r = fma (a2, r, r6);
162+
r = fma (a2, r, r5);
163+
r = fma (a2, r, r4);
164+
r = fma (a2, r, r3);
165+
r = fma (a2, r, r2);
166+
r = fma (a2, r, r1);
167+
168+
if (sign)
169+
return -1.0 + r;
170+
else
171+
return 1.0 - r;
172+
}
173+
else if (ia < 0x40100000)
174+
{ /* 3.25 <= |x| < 4.0. */
175+
double a = fabs (x);
176+
a = a - 3.25;
177+
178+
double r1 = fma (a, PE[1], PE[0]);
179+
double r2 = fma (a, PE[3], PE[2]);
180+
double r3 = fma (a, PE[5], PE[4]);
181+
double r4 = fma (a, PE[7], PE[6]);
182+
double r5 = fma (a, PE[9], PE[8]);
183+
double r6 = fma (a, PE[11], PE[10]);
184+
double r7 = fma (a, PE[13], PE[12]);
185+
186+
double a2 = a * a;
187+
188+
double r = r7;
189+
r = fma (a2, r, r6);
190+
r = fma (a2, r, r5);
191+
r = fma (a2, r, r4);
192+
r = fma (a2, r, r3);
193+
r = fma (a2, r, r2);
194+
r = fma (a2, r, r1);
195+
196+
if (sign)
197+
return -1.0 + r;
198+
else
199+
return 1.0 - r;
200+
}
201+
else if (ia < 0x4017a000)
202+
{ /* 4 <= |x| < 5.90625. */
203+
double a = fabs (x);
204+
a = fma (0.5, a, -2.0);
205+
206+
double r1 = fma (a, PF[1], PF[0]);
207+
double r2 = fma (a, PF[3], PF[2]);
208+
double r3 = fma (a, PF[5], PF[4]);
209+
double r4 = fma (a, PF[7], PF[6]);
210+
double r5 = fma (a, PF[9], PF[8]);
211+
double r6 = fma (a, PF[11], PF[10]);
212+
double r7 = fma (a, PF[13], PF[12]);
213+
double r8 = fma (a, PF[15], PF[14]);
214+
double r9 = PF[16];
215+
216+
double a2 = a * a;
217+
218+
double r = r9;
219+
r = fma (a2, r, r8);
220+
r = fma (a2, r, r7);
221+
r = fma (a2, r, r6);
222+
r = fma (a2, r, r5);
223+
r = fma (a2, r, r4);
224+
r = fma (a2, r, r3);
225+
r = fma (a2, r, r2);
226+
r = fma (a2, r, r1);
227+
228+
if (sign)
229+
return -1.0 + r;
230+
else
231+
return 1.0 - r;
232+
}
233+
else
234+
{
235+
/* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */
236+
if (unlikely (ia >= 0x7ff00000))
237+
return (double) (1.0 - (sign << 1)) + 1.0 / x;
238+
239+
if (sign)
240+
return -1.0;
241+
else
242+
return 1.0;
243+
}
244+
}

math/erf_data.c

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*
2+
* Shared data between erf and erfc.
3+
*
4+
* Copyright (c) 2019-2020, Arm Limited.
5+
* SPDX-License-Identifier: MIT
6+
*/
7+
8+
#include "math_config.h"
9+
10+
/*
11+
Minimax approximation of erf
12+
*/
13+
const struct erf_data __erf_data = {
14+
.erf_poly_A = {
15+
#if ERF_POLY_A_NCOEFFS == 10
16+
0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,
17+
-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,
18+
0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,
19+
-0x1.18c47fd143c5ep-23
20+
#endif
21+
},
22+
/* Rational approximation on [0x1p-28, 0.84375] */
23+
.erf_ratio_N_A = {
24+
0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,
25+
-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16
26+
},
27+
.erf_ratio_D_A = {
28+
0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,
29+
0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18
30+
},
31+
/* Rational approximation on [0.84375, 1.25] */
32+
.erf_ratio_N_B = {
33+
-0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2,
34+
0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5,
35+
-0x1.1bf380a96073fp-9
36+
},
37+
.erf_ratio_D_B = {
38+
0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4,
39+
0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7
40+
},
41+
.erfc_poly_C = {
42+
#if ERFC_POLY_C_NCOEFFS == 16
43+
/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 */
44+
0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2,
45+
-0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5,
46+
-0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8,
47+
-0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12,
48+
0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16,
49+
-0x1.578c9e375d37p-19
50+
#endif
51+
},
52+
.erfc_poly_D = {
53+
#if ERFC_POLY_D_NCOEFFS == 18
54+
/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 */
55+
0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3,
56+
-0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2,
57+
-0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2,
58+
0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3,
59+
-0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3,
60+
0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10
61+
#endif
62+
},
63+
.erfc_poly_E = {
64+
#if ERFC_POLY_E_NCOEFFS == 14
65+
/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 */
66+
0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14,
67+
-0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12,
68+
0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14,
69+
-0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20,
70+
-0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23
71+
#endif
72+
},
73+
.erfc_poly_F = {
74+
#if ERFC_POLY_F_NCOEFFS == 17
75+
/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 */
76+
0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19,
77+
-0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14,
78+
0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11,
79+
-0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11,
80+
0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14,
81+
-0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19
82+
#endif
83+
}
84+
};
85+

math/math_config.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,4 +440,23 @@ extern const struct erff_data
440440
float erff_poly_B[7];
441441
} __erff_data HIDDEN;
442442

443+
#define ERF_POLY_A_ORDER 19
444+
#define ERF_POLY_A_NCOEFFS 10
445+
#define ERFC_POLY_C_NCOEFFS 16
446+
#define ERFC_POLY_D_NCOEFFS 18
447+
#define ERFC_POLY_E_NCOEFFS 14
448+
#define ERFC_POLY_F_NCOEFFS 17
449+
extern const struct erf_data
450+
{
451+
double erf_poly_A[ERF_POLY_A_NCOEFFS];
452+
double erf_ratio_N_A[5];
453+
double erf_ratio_D_A[5];
454+
double erf_ratio_N_B[7];
455+
double erf_ratio_D_B[6];
456+
double erfc_poly_C[ERFC_POLY_C_NCOEFFS];
457+
double erfc_poly_D[ERFC_POLY_D_NCOEFFS];
458+
double erfc_poly_E[ERFC_POLY_E_NCOEFFS];
459+
double erfc_poly_F[ERFC_POLY_F_NCOEFFS];
460+
} __erf_data HIDDEN;
461+
443462
#endif

math/test/mathbench.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ D (log2, 0.999, 1.001)
248248
{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
249249
D (xpow, 0.01, 11.1)
250250
D (ypow, -9.9, 9.9)
251+
D (erf, -6.0, 6.0)
251252

252253
F (dummyf, 1.0, 2.0)
253254
F (expf, -9.9, 9.9)

math/test/runulp.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,14 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000
7272
t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000
7373
t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
7474

75+
L=1.0
76+
t erf 0 0xffff000000000000 10000
77+
t erf 0x1p-1022 0x1p-26 40000
78+
t erf -0x1p-1022 -0x1p-26 40000
79+
t erf 0x1p-26 0x1p3 40000
80+
t erf -0x1p-26 -0x1p3 40000
81+
t erf 0 inf 40000
82+
7583
L=0.01
7684
t expf 0 0xffff0000 10000
7785
t expf 0x1p-14 0x1p8 50000
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
; erf.tst - Directed test cases for erf
2+
;
3+
; Copyright (c) 2007-2020, Arm Limited.
4+
; SPDX-License-Identifier: MIT
5+
6+
func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
7+
func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0
8+
func=erf op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
9+
func=erf op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
10+
func=erf op1=7ff00000.00000000 result=3ff00000.00000000 errno=0
11+
func=erf op1=fff00000.00000000 result=bff00000.00000000 errno=0
12+
func=erf op1=00000000.00000000 result=00000000.00000000 errno=ERANGE
13+
func=erf op1=80000000.00000000 result=80000000.00000000 errno=ERANGE
14+
func=erf op1=00000000.00000001 result=00000000.00000001 errno=0 status=ux
15+
func=erf op1=80000000.00000001 result=80000000.00000001 errno=0 status=ux
16+
func=erf op1=3ff00000.00000000 result=3feaf767.a741088a.c6d errno=0
17+
func=erf op1=bff00000.00000000 result=bfeaf767.a741088a.c6d errno=0

0 commit comments

Comments
 (0)