Skip to content

Commit 3cee8ce

Browse files
committed
cleaned up erf(Float64)
1 parent 0fc6d4d commit 3cee8ce

File tree

1 file changed

+46
-138
lines changed

1 file changed

+46
-138
lines changed

src/erf.jl

Lines changed: 46 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -97,36 +97,14 @@ See also:
9797
[`erfinv(x)`](@ref erfinv), [`erfcinv(x)`](@ref erfcinv).
9898
9999
# Implementation by
100-
- `Float32`/`Float64`: Julia implementation of https://github.com/ARM-software/optimized-routines/blob/master/math/erf.c
100+
- `Float32`/`Float64`: Julia implementation of https://github.com/ARM-software/optimized-routines/blob/master/math/erf.c with modification
101101
- `BigFloat`: C library for multiple-precision floating-point [MPFR](https://www.mpfr.org/)
102102
"""
103103

104104
# Fast erf implementation using a mix of
105105
# rational and polynomial approximations.
106106
# Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0.
107107
function _erf(x::Float64)
108-
# Minimax approximation of erf
109-
PA=(0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,-0x1.18c47fd143c5ep-23)
110-
# Rational approximation on [0x1p-28, 0.84375]
111-
NA=(0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16)
112-
DA=(0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18)
113-
# Rational approximation on [0.84375, 1.25]
114-
NB=( -0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2, 0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5, -0x1.1bf380a96073fp-9 )
115-
DB=( 0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4, 0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7 )
116-
117-
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25
118-
PC=( 0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, -0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, -0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, -0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, 0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, -0x1.578c9e375d37p-19)
119-
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2
120-
PD=( 0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, -0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, -0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, 0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, -0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, 0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 )
121-
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25
122-
PE=( 0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, -0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, 0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, -0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, -0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 )
123-
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4
124-
PF=( 0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, -0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, 0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, -0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, 0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, -0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 )
125-
126-
C = 0x1.b0ac16p-1
127-
128-
TwoOverSqrtPiMinusOne=0x1.06eba8214db69p-3
129-
130108

131109
# # top 32 bits
132110
ix::UInt32=reinterpret(UInt64,x)>>32
@@ -146,52 +124,40 @@ function _erf(x::Float64)
146124

147125
if (ia < 0x3fe00000)
148126
## a < 0.5 - Use polynomial approximation.
149-
r1 = fma(x2, PA[2], PA[1])
150-
r2 = fma(x2, PA[4], PA[3])
151-
r3 = fma(x2, PA[6], PA[5])
152-
r4 = fma(x2, PA[8], PA[7])
153-
r5 = fma(x2, PA[10], PA[9])
154-
155-
x4 = x2 * x2
156-
r = r5
157-
r = fma(x4, r, r4)
158-
r = fma(x4, r, r3)
159-
r = fma(x4, r, r2)
160-
r = fma(x4, r, r1)
161-
return fma(r, x, x) ## This fma is crucial for accuracy.
127+
128+
# Minimax approximation of erf of the form x*P(x^2) approximately on the interval [0;0.5]
129+
PA=(0.12837916709551256, -0.3761263890318287, 0.11283791670896592, -0.026866170630075903, 0.005223977428649761, -0.0008548312229989974, 0.00012054654502151287, -1.4906315067498891e-5, 1.6126444349070824e-6, -1.3074259056679966e-7)
130+
131+
r= fma(x,evalpoly(x2,PA),x) ## This fma is crucial for accuracy.
132+
133+
return r
162134
else
163135
## 0.5 <= a < 0.84375 - Use rational approximation.
164136

165-
r1n = fma(x2, NA[2], NA[1])
166-
x4 = x2 * x2
167-
r2n = fma(x2, NA[4], NA[3])
168-
x8 = x4 * x4
169-
r1d = fma(x2, DA[1], 1.0)
170-
r2d = fma(x2, DA[3], DA[2])
171-
r3d = fma(x2, DA[5], DA[4])
172-
P = r1n + x4 * r2n + x8 * NA[5]
137+
# Rational approximation on [0x1p-28, 0.84375]
138+
NA=(0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16)
139+
DA=(1,0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18)
140+
141+
P=evalpoly(x2,NA)
142+
Q=evalpoly(x2,DA)
173143

174-
Q = r1d + x4 * r2d + x8 * r3d
175144
return fma(P / Q, x, x)
176145
end
177146
elseif (ia < 0x3ff40000)
178147
## 0.84375 <= |x| < 1.25.
179148

149+
# Rational approximation on [0.84375, 1.25]
150+
NB=( -0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2, 0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5, -0x1.1bf380a96073fp-9 )
151+
DB=( 1, 0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4, 0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7 )
152+
153+
C = 0x1.b0ac16p-1
154+
180155
a = abs(x) - 1.0
181-
r1n = fma(a, NB[2], NB[1])
182-
a2 = a * a
183-
r1d = fma(a, DB[1], 1.0)
184-
a4 = a2 * a2
185-
r2n = fma(a, NB[4], NB[3])
186-
a6 = a4 * a2
187-
r2d = fma(a, DB[3], DB[2])
188-
r3n = fma(a, NB[6], NB[5])
189-
r3d = fma(a, DB[5], DB[4])
190-
r4n = NB[7]
191-
r4d = DB[6]
192-
193-
P = r1n + a2 * r2n + a4 * r3n + a6 * r4n
194-
Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d
156+
157+
P=evalpoly(a,NB)
158+
Q=evalpoly(a,DB)
159+
160+
195161
if (sign)
196162
return -C - P / Q
197163
else
@@ -201,27 +167,13 @@ function _erf(x::Float64)
201167
## 1.25 <= |x| < 2.0.
202168
a = abs(x)
203169
a = a - 1.25
170+
171+
# erfc polynomial approximation
172+
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25
173+
PC=( 0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, -0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, -0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, -0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, 0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, -0x1.578c9e375d37p-19)
204174

205-
r1 = fma(a, PC[2], PC[1])
206-
r2 = fma(a, PC[4], PC[3])
207-
r3 = fma(a, PC[6], PC[5])
208-
r4 = fma(a, PC[8], PC[7])
209-
r5 = fma(a, PC[10], PC[9])
210-
r6 = fma(a, PC[12], PC[11])
211-
r7 = fma(a, PC[14], PC[13])
212-
r8 = fma(a, PC[16], PC[15])
213-
214-
215-
a2 = a * a
216-
217-
r = r8
218-
r = fma(a2, r, r7)
219-
r = fma(a2, r, r6)
220-
r = fma(a2, r, r5)
221-
r = fma(a2, r, r4)
222-
r = fma(a2, r, r3)
223-
r = fma(a2, r, r2)
224-
r = fma(a2, r, r1)
175+
# Obtains erfc of |x|
176+
r=evalpoly(a,PC)
225177

226178
if (sign)
227179
return -1.0 + r
@@ -233,27 +185,13 @@ function _erf(x::Float64)
233185
a = abs(x)
234186
a = fma(0.5, a, -1.0)
235187

236-
r1 = fma(a, PD[2], PD[1])
237-
r2 = fma(a, PD[4], PD[3])
238-
r3 = fma(a, PD[6], PD[5])
239-
r4 = fma(a, PD[8], PD[7])
240-
r5 = fma(a, PD[10], PD[9])
241-
r6 = fma(a, PD[12], PD[11])
242-
r7 = fma(a, PD[14], PD[13])
243-
r8 = fma(a, PD[16], PD[15])
244-
r9 = fma(a, PD[18], PD[17])
245-
246-
a2 = a * a
247-
248-
r = r9
249-
r = fma(a2, r, r8)
250-
r = fma(a2, r, r7)
251-
r = fma(a2, r, r6)
252-
r = fma(a2, r, r5)
253-
r = fma(a2, r, r4)
254-
r = fma(a2, r, r3)
255-
r = fma(a2, r, r2)
256-
r = fma(a2, r, r1)
188+
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2
189+
PD=( 0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, -0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, -0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, 0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, -0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, 0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 )
190+
191+
192+
# Obtains erfc of |x|
193+
r=evalpoly(a,PD)
194+
257195

258196
if (sign)
259197
return -1.0 + r
@@ -265,24 +203,11 @@ function _erf(x::Float64)
265203
a = abs(x)
266204
a = a - 3.25
267205

268-
r1 = fma(a, PE[2], PE[1])
269-
r2 = fma(a, PE[4], PE[3])
270-
r3 = fma(a, PE[6], PE[5])
271-
r4 = fma(a, PE[8], PE[7])
272-
r5 = fma(a, PE[10], PE[9])
273-
r6 = fma(a, PE[12], PE[11])
274-
r7 = fma(a, PE[14], PE[13])
275-
206+
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25
207+
PE=( 0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, -0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, 0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, -0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, -0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 )
276208

277-
a2 = a * a
209+
r=evalpoly(a,PE)
278210

279-
r = r7
280-
r = fma(a2, r, r6)
281-
r = fma(a2, r, r5)
282-
r = fma(a2, r, r4)
283-
r = fma(a2, r, r3)
284-
r = fma(a2, r, r2)
285-
r = fma(a2, r, r1)
286211

287212
if (sign)
288213
return -1.0 + r
@@ -294,28 +219,11 @@ function _erf(x::Float64)
294219
a = abs(x)
295220
a = fma(0.5, a, -2.0)
296221

297-
r1 = fma(a, PF[2], PF[1])
298-
r2 = fma(a, PF[4], PF[3])
299-
r3 = fma(a, PF[6], PF[5])
300-
r4 = fma(a, PF[8], PF[7])
301-
r5 = fma(a, PF[10], PF[9])
302-
r6 = fma(a, PF[12], PF[11])
303-
r7 = fma(a, PF[14], PF[13])
304-
r8 = fma(a, PF[16], PF[15])
305-
306-
r9 = PF[17]
307-
308-
a2 = a * a
309-
310-
r = r9
311-
r = fma(a2, r, r8)
312-
r = fma(a2, r, r7)
313-
r = fma(a2, r, r6)
314-
r = fma(a2, r, r5)
315-
r = fma(a2, r, r4)
316-
r = fma(a2, r, r3)
317-
r = fma(a2, r, r2)
318-
r = fma(a2, r, r1)
222+
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4
223+
PF=( 0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, -0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, 0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, -0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, 0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, -0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 )
224+
225+
r=evalpoly(a,PF)
226+
319227

320228
if (sign)
321229
return -1.0 + r

0 commit comments

Comments
 (0)