Skip to content

Commit 4afbc6c

Browse files
authored
Merge pull request #88 from JuliaMath/better-sin_sum
better sin_sum
2 parents 6a3eed7 + daf86ce commit 4afbc6c

File tree

3 files changed

+107
-102
lines changed

3 files changed

+107
-102
lines changed

src/besselj.jl

Lines changed: 13 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -59,21 +59,11 @@ function _besselj0(x::Float64)
5959
q2 = (-1/8, 25/384, -1073/5120, 375733/229376)
6060
p = evalpoly(x2, p2)
6161
q = evalpoly(x2, q2)
62-
if x > 1e15
63-
a = SQ2OPI(T) * sqrt(xinv) * p
64-
xn = muladd(xinv, q, -PIO4(T))
65-
s_x, c_x = sincos(x)
66-
s_xn, c_xn = sincos(xn)
67-
return a * (c_x * c_xn - s_x * s_xn)
68-
end
6962
end
7063

7164
a = SQ2OPI(T) * sqrt(xinv) * p
72-
xn = muladd(xinv, q, -PIO4(T))
73-
74-
# the following computes b = cos(x + xn) more accurately
75-
# see src/misc.jl
76-
b = cos_sum(x, xn)
65+
xn = xinv*q
66+
b = sin_sum(x, pi/4, xn)
7767
return a * b
7868
end
7969
end
@@ -137,21 +127,11 @@ function _besselj1(x::Float64)
137127
q2 = (3/8, -21/128, 1899/5120, -543483/229376)
138128
p = evalpoly(x2, p2)
139129
q = evalpoly(x2, q2)
140-
if x > 1e15
141-
a = SQ2OPI(T) * sqrt(xinv) * p
142-
xn = muladd(xinv, q, -3 * PIO4(T))
143-
s_x, c_x = sincos(x)
144-
s_xn, c_xn = sincos(xn)
145-
return s * a * (c_x * c_xn - s_x * s_xn)
146-
end
147130
end
148131

149132
a = SQ2OPI(T) * sqrt(xinv) * p
150-
xn = muladd(xinv, q, -3 * PIO4(T))
151-
152-
# the following computes b = cos(x + xn) more accurately
153-
# see src/misc.jl
154-
b = cos_sum(x, xn)
133+
xn = xinv*q
134+
b = sin_sum(x, -pi/4, xn)
155135
return a * b * s
156136
end
157137
end
@@ -182,7 +162,7 @@ end
182162
#####
183163

184164
besselj0(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli0(z/im)
185-
besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im
165+
besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im
186166

187167

188168
# Bessel functions of the first kind of order nu
@@ -208,17 +188,17 @@ besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im
208188
#
209189
# For values where the expansions for large arguments and orders are not valid, backward recurrence is employed after shifting the order up
210190
# to where `besseljy_debye` is accurate then using downward recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point.
211-
#
191+
#
212192
# [1] http://dlmf.nist.gov/10.2.E2
213-
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
193+
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
214194
# Advances in Computational Mathematics 45.1 (2019): 173-211.
215-
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
195+
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
216196
# Applied and Computational Harmonic Analysis 1.1 (1993): 116-135.
217-
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
197+
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
218198
# Applied and Computational Harmonic Analysis, 39(2), 347-356.
219-
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
199+
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
220200
# Computer physics communications 76.3 (1993): 381-388.
221-
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
201+
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
222202
# Vol. 55. US Government printing office, 1964.
223203
#
224204

@@ -432,7 +412,7 @@ end
432412
# Cutoff for Float32 determined from using Float64 precision down to eps(Float32)
433413
besselj_series_cutoff(v, x::Float32) = (x < 20.0) || v > (14.4 + x*(-0.455 + 0.027*x))
434414
besselj_series_cutoff(v, x::Float64) = (x < 7.0) || v > (2 + x*(0.109 + 0.062*x))
435-
# cutoff for Float128 for ~1e-35 relative error
415+
# cutoff for Float128 for ~1e-35 relative error
436416
#besselj_series_cutoff(v, x::AbstractFloat) = (x < 4.0) || v > (x*(0.08 + 0.12*x))
437417

438418
#=
@@ -467,7 +447,7 @@ end
467447
# On the other hand, shifting the order down avoids any concern about underflow for large orders
468448
# Shifting the order too high while keeping x fixed could result in numerical underflow
469449
# Therefore we need to shift up only until the `besseljy_debye` is accurate and need to test that no underflow occurs
470-
# Shifting the order up decreases the value substantially for high orders and results
450+
# Shifting the order up decreases the value substantially for high orders and results
471451
# in a stable forward recurrence as the values rapidly increase
472452
function besselj_recurrence(nu, x)
473453
# shift order up to where expansions are valid see src/U_polynomials.jl

src/bessely.jl

Lines changed: 16 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ function _bessely0_compute(x::Float64)
6464
z = w * w
6565
p = evalpoly(z, PP_y0(T)) / evalpoly(z, PQ_y0(T))
6666
q = evalpoly(z, QP_y0(T)) / evalpoly(z, QQ_y0(T))
67-
xn = x - PIO4(T)
67+
xn = x - pi/4
6868
sc = sincos(xn)
6969
p = p * sc[1] + w * q * sc[2]
7070
return p * SQ2OPI(T) / sqrt(x)
@@ -81,21 +81,11 @@ function _bessely0_compute(x::Float64)
8181
q2 = (-1/8, 25/384, -1073/5120, 375733/229376)
8282
p = evalpoly(x2, p2)
8383
q = evalpoly(x2, q2)
84-
if x > 1e15
85-
a = SQ2OPI(T) * sqrt(xinv) * p
86-
xn = muladd(xinv, q, -PIO4(T))
87-
s_x, c_x = sincos(x)
88-
s_xn, c_xn = sincos(xn)
89-
return a * (s_x * c_xn + c_x * s_xn)
90-
end
9184
end
9285

9386
a = SQ2OPI(T) * sqrt(xinv) * p
94-
xn = muladd(xinv, q, -PIO4(T))
95-
96-
# the following computes b = sin(x + xn) more accurately
97-
# see src/misc.jl
98-
b = sin_sum(x, xn)
87+
xn = xinv*q
88+
b = sin_sum(x, -pi/4, xn)
9989
return a * b
10090
end
10191
end
@@ -170,21 +160,11 @@ function _bessely1_compute(x::Float64)
170160
q2 = (3/8, -21/128, 1899/5120, -543483/229376)
171161
p = evalpoly(x2, p2)
172162
q = evalpoly(x2, q2)
173-
if x > 1e15
174-
a = SQ2OPI(T) * sqrt(xinv) * p
175-
xn = muladd(xinv, q, - 3 * PIO4(T))
176-
s_x, c_x = sincos(x)
177-
s_xn, c_xn = sincos(xn)
178-
return a * (s_x * c_xn + c_x * s_xn)
179-
end
180163
end
181164

182165
a = SQ2OPI(T) * sqrt(xinv) * p
183-
xn = muladd(xinv, q, - 3 * PIO4(T))
184-
185-
# the following computes b = sin(x + xn) more accurately
186-
# see src/misc.jl
187-
b = sin_sum(x, xn)
166+
xn = xinv*q
167+
b = sin_sum(-x, -pi/4, -xn)
188168
return a * b
189169
end
190170
end
@@ -225,7 +205,7 @@ end
225205
# for positive arguments and orders. For integer orders up to 250, forward recurrence is used starting from
226206
# `bessely0` and `bessely1` routines for calculation of Y_{n}(x) of the zero and first order.
227207
# For small arguments, Y_{ν}(x) is calculated from the power series (`bessely_power_series(nu, x`) form of J_{ν}(x) using the connection formula [1].
228-
#
208+
#
229209
# When x < ν and ν being reasonably large, the debye asymptotic expansion (Eq. 33; [3]) is used `besseljy_debye(nu, x)`.
230210
# When x > ν and x being reasonably large, the Hankel function is calculated from the debye expansion (Eq. 29; [3]) with `hankel_debye(nu, x)`
231211
# and Y_{n}(x) is calculated from the imaginary part of the Hankel function.
@@ -234,20 +214,20 @@ end
234214
#
235215
# For values where the expansions for large arguments and orders are not valid, forward recurrence is employed after shifting the order down
236216
# to where these expansions are valid then using recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point.
237-
# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7
217+
# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7
238218
# without loss of precision. Therefore, when x > 19 the Hankel expansion is used to generate starting values after shifting the order down.
239219
# When x ∈ (7, 19) and ν ∈ (0, 2) Chebyshev approximation is used with higher orders filled by recurrence.
240-
#
220+
#
241221
# [1] https://dlmf.nist.gov/10.2#E3
242-
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
222+
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
243223
# Advances in Computational Mathematics 45.1 (2019): 173-211.
244-
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
224+
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
245225
# Applied and Computational Harmonic Analysis 1.1 (1993): 116-135.
246-
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
226+
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
247227
# Applied and Computational Harmonic Analysis, 39(2), 347-356.
248-
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
228+
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
249229
# Computer physics communications 76.3 (1993): 381-388.
250-
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
230+
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
251231
# Vol. 55. US Government printing office, 1964.
252232
#
253233

@@ -389,7 +369,7 @@ No checks on arguments are performed and should only be called if certain nu, x
389369
"""
390370
function bessely_positive_args(nu, x::T) where T
391371
iszero(x) && return -T(Inf)
392-
372+
393373
# use forward recurrence if nu is an integer up until it becomes inefficient
394374
(isinteger(nu) && nu < 250) && return besselj_up_recurrence(x, bessely1(x), bessely0(x), 1, nu)[1]
395375

@@ -418,7 +398,7 @@ end
418398
# combined to calculate both J_v and J_{-v} in the same loop
419399
# J_{-v} always converges slower so just check that convergence
420400
# Combining the loop was faster than using two separate loops
421-
#
401+
#
422402
# this works well for small arguments x < 7.0 for rel. error ~1e-14
423403
# this also works well for nu > 1.35x - 4.5
424404
# for nu > 25 more cancellation occurs near integer values
@@ -575,7 +555,7 @@ function log_bessely_power_series(v, x::T) where T
575555
logout2 = -v * log(x/2) + log(out2)
576556
577557
spi, cpi = sincospi(v)
578-
558+
579559
#tmp = logout2 + log(abs(sign*(inv(spi)) - exp(logout - logout2) * cpi / spi))
580560
tmp = logout + log((-cpi + exp(logout2) - logout) / spi)
581561

src/misc.jl

Lines changed: 78 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,92 @@
1-
# function to more accurately compute cos(x + xn)
2-
# see https://github.com/heltonmc/Bessels.jl/pull/13
3-
# written by @oscardssmith
4-
function cos_sum(x, xn)
5-
s = x + xn
6-
n, r = reduce_pi02_med(s)
7-
lo = r.lo - ((s - x) - xn)
8-
hi = r.hi + lo
9-
y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo)
10-
n = n&3
1+
using Base.Math: sin_kernel, cos_kernel, sincos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32
2+
3+
"""
4+
computes sin(sum(xs)) where xs are sorted by absolute value
5+
Doing this is much more accurate than the naive sin(sum(xs))
6+
"""
7+
function sin_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat
8+
n, y = rem_pio2_sum(xs...)
9+
n &= 3
1110
if n == 0
12-
return Base.Math.cos_kernel(y)
11+
return sin_kernel(y)
1312
elseif n == 1
14-
return -Base.Math.sin_kernel(y)
13+
return cos_kernel(y)
1514
elseif n == 2
16-
return -Base.Math.cos_kernel(y)
15+
return -sin_kernel(y)
1716
else
18-
return Base.Math.sin_kernel(y)
17+
return -cos_kernel(y)
1918
end
2019
end
21-
# function to more accurately compute sin(x + xn)
22-
function sin_sum(x, xn)
23-
s = x + xn
24-
n, r = reduce_pi02_med(s)
25-
lo = r.lo - ((s - x) - xn)
26-
hi = r.hi + lo
27-
y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo)
28-
n = n&3
20+
21+
"""
22+
computes sincos(sum(xs)) where xs are sorted by absolute value
23+
Doing this is much more accurate than the naive sincos(sum(xs))
24+
"""
25+
function sincos_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat
26+
n, y = rem_pio2_sum(xs...)
27+
n &= 3
28+
si, co = sincos_kernel(y)
2929
if n == 0
30-
return Base.Math.sin_kernel(y)
30+
return si, co
3131
elseif n == 1
32-
return Base.Math.cos_kernel(y)
32+
return co, -si
3333
elseif n == 2
34-
return -Base.Math.sin_kernel(y)
34+
return -si, -co
3535
else
36-
return -Base.Math.cos_kernel(y)
36+
return -co, si
37+
end
38+
end
39+
40+
function rem_pio2_sum(xs::Vararg{Float64})
41+
n = 0
42+
hi, lo = 0.0, 0.0
43+
for x in xs
44+
if abs(x) <= pi/4
45+
s = x + hi
46+
lo += (x - (s - hi))
47+
else
48+
n_i, y = rem_pio2_kernel(x)
49+
n += n_i
50+
s = y.hi + hi
51+
lo += (y.hi - (s - hi)) + y.lo
52+
end
53+
hi = s
54+
end
55+
while hi > pi/4
56+
hi -= pi/2
57+
lo -= 6.123233995736766e-17
58+
n += 1
59+
end
60+
while hi < -pi/4
61+
hi += pi/2
62+
lo += 6.123233995736766e-17
63+
n -= 1
64+
end
65+
return n, DoubleFloat64(hi, lo)
66+
end
67+
68+
function rem_pio2_sum(xs::Vararg{Float32})
69+
y = 0.0
70+
n = 0
71+
# The minimum cosine or sine of any Float32 that gets reduced is 1.6e-9
72+
# so reducing at 2^22 prevents catastrophic loss of precision.
73+
# There probably is a case where this loses some digits but it is a decent
74+
# tradeoff between accuracy and speed.
75+
@fastmath for x in xs
76+
if x > 0x1p22
77+
n_i, y_i = rem_pio2_kernel(Float32(x))
78+
n += n_i
79+
y += y_i.hi
80+
else
81+
y += x
82+
end
3783
end
84+
n_i, y = rem_pio2_kernel(y)
85+
return n + n_i, DoubleFloat32(y.hi)
3886
end
39-
@inline function reduce_pi02_med(x::Float64)
40-
pio2_1 = 1.57079632673412561417e+00
4187

42-
fn = round(x*(2/pi))
43-
r = muladd(-fn, pio2_1, x)
44-
w = fn * 6.07710050650619224932e-11
45-
y = r-w
46-
return unsafe_trunc(Int, fn), Base.Math.DoubleFloat64(y, (r-y)-w)
88+
function rem_pio2_sum(xs::Vararg{Float16})
89+
y = sum(Float64, xs) #Float16 can be losslessly accumulated in Float64
90+
n, y = rem_pio2_kernel(y)
91+
return n, DoubleFloat32(y.hi)
4792
end

0 commit comments

Comments
 (0)